1
Bash script to download and search YouTube subtitles and output clickable, timestamped URLs
(lemm.ee)
Here is the script.
#!/usr/bin/env bash
# Download and search youtube subs
# deps: yt-dlp, awk, perl, sed, and at least one of ugrep, ripgrep, or grep
# usage: script youtube_url
#######################################
# Runs the whole workflow: validate the URL, locate or download the subtitle
# and description files, format the transcript, print the description and
# finally prompt for a search term.
# Globals:   url (written); each step below reads/writes further globals
# Arguments: the video URL. All arguments are joined with single spaces, so a
#            stray extra argument makes URL validation fail instead of being
#            silently dropped.
#######################################
main() {
  # "$*" is the explicit way to join the arguments into one string; assigning
  # "$@" to a scalar is flagged by ShellCheck (SC2124).
  url="$*"
  check_if_url
  get_video_id
  search_for_downloaded_matching_files
  set_download_boolean_flag
  download_subs
  read_and_format_transcript_file
  echo_description_file
  user_search
}
#######################################
# Validates that the global `url` is one https URL containing no whitespace.
# Globals: url (read)
# Outputs: an explanatory message on stderr when validation fails
# Exits:   terminates the script with status 1 on invalid input
#######################################
check_if_url() {
  local regex='^https://[^[:space:]]+$'
  if ! [[ $url =~ $regex ]]; then
    # Diagnostics belong on stderr so they never mix with search output.
    echo "Invalid input. Valid input is a url matching regex ${regex}" >&2
    exit 1
  fi
}
#######################################
# Extracts the video id from the global `url` into the global `video_id`.
# Recognised forms:
#   ...?v=ID / ...&v=ID          (first v= query parameter wins)
#   youtu.be/ID, /shorts/ID, /embed/ID, /live/ID
# The id stops at the next '&', '#', '?' or '/' so trailing parameters and
# fragments are never included.
# Globals: url (read), video_id (written; empty when nothing matched)
# Outputs: a warning on stderr when no id could be extracted
#######################################
get_video_id() {
  # Patterns are kept in variables so '&' and '?' need no shell escaping.
  local query_re='[?&]v=([^&#]+)'
  local path_re='(youtu\.be/|/shorts/|/embed/|/live/)([^/?&#]+)'

  video_id=""
  if [[ $url =~ $query_re ]]; then
    video_id="${BASH_REMATCH[1]}"
  elif [[ $url =~ $path_re ]]; then
    video_id="${BASH_REMATCH[2]}"
  else
    printf 'Warning: could not extract a video id from %s\n' "$url" >&2
  fi
}
#######################################
# Prints the path of the newest file in $PWD whose name contains the global
# video_id and ends in ".<ext>". Prints nothing when no file matches.
# Globals:   video_id (read)
# Arguments: $1 - file extension without the leading dot
#######################################
_newest_matching_file() {
  local ext=$1
  local listing
  # video_id is quoted so it is matched literally, never split or globbed.
  local -a candidates=("$PWD"/*"${video_id}"*."${ext}")

  # Without nullglob an unmatched pattern is left behind as a literal string.
  [[ -e ${candidates[0]} ]] || return 0

  # Prefer creation time (GNU ls); fall back to modification time where the
  # installed ls does not understand --time=creation.
  listing=$(command ls -dt --time=creation -- "${candidates[@]}" 2>/dev/null) ||
    listing=$(command ls -dt -- "${candidates[@]}" 2>/dev/null)

  # Keep only the first (newest) line.
  printf '%s\n' "${listing%%$'\n'*}"
}

#######################################
# Finds the newest already-downloaded subtitle and description files for the
# current video in the working directory.
# Globals: video_id (read); transcript_file, description_file (written,
#          empty when no matching file exists)
#######################################
search_for_downloaded_matching_files() {
  transcript_file="$(_newest_matching_file vtt)"
  description_file="$(_newest_matching_file description)"
}
#######################################
# Decides whether the subtitle/description files still have to be fetched.
# Globals: transcript_file, description_file (read); download (written)
# Result:  download=0 when both paths are non-empty, download=1 otherwise.
#######################################
set_download_boolean_flag() {
  if [[ -z $transcript_file || -z $description_file ]]; then
    download=1 # TRUE: at least one of the two paths is empty
  else
    download=0 # FALSE: both paths were found
  fi
}
#######################################
# Fetches subtitles and the description with yt-dlp when the download flag is
# set, then refreshes the file lookup.
# Globals: download, url (read); transcript_file and description_file are
#          refreshed via search_for_downloaded_matching_files
#######################################
download_subs() {
  # Anything other than download=1 means there is nothing to fetch.
  [ "$download" -eq 1 ] || return 0

  # Three separate runs: auto-generated subs, uploaded subs, description.
  # --skip-download keeps yt-dlp from fetching the media itself.
  yt-dlp --restrict-filenames --write-auto-sub --skip-download "$url"
  # NOTE(review): confirm that "eng" is the language code the site actually
  # uses for its subtitles — not verifiable from this script alone.
  yt-dlp --restrict-filenames --sub-langs=eng --write-subs --skip-download "$url"
  yt-dlp --restrict-filenames --write-description --skip-download "$url"

  # The files only exist now, so look them up again.
  search_for_downloaded_matching_files
}
read_and_format_transcript_file() {
perl_removed_dupes="$(perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' <"${transcript_file}")"
local prefix="https://www.youtube.com/watch?v=${video_id}&t="
local suffix="s"
formated_transcript_file="$(awk -v pre="$prefix" -v suf="$suffix" '
/^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
split($1, a, /[:.]/);
$1 = pre (int(a[1]*3600 + a[2]*60 + a[3]) - 3) suf;
sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "");
sub(/ align:start position:0%$/, "");
print;
next;
}
{
sub(/ align:start position:0%$/, "");
print;
}
' <<<"${perl_removed_dupes}")"
#CRLF for ugrep to avoid ?bug? where before lines are not all outputted
formated_transcript_file_CRLF=$(printf '%b' "$formated_transcript_file" | sed 's/$/\r/')
}
#######################################
# Prints the video description to stdout.
# Globals: description_file (read)
# Outputs: file contents on stdout; a note on stderr when there is no
#          readable description file
# Returns: non-zero when the description could not be printed
#######################################
echo_description_file() {
  # An empty or stale path would otherwise surface as a bare cat error.
  if [[ ! -r ${description_file} ]]; then
    printf 'No readable description file found; skipping description output.\n' >&2
    return 1
  fi
  cat "${description_file}"
}
#######################################
# Prompts for a regex and searches the formatted transcript with every
# available tool: ugrep and/or ripgrep when installed, plain grep otherwise.
# Globals: formated_transcript_file, formated_transcript_file_CRLF (read);
#          search_term (written)
# Inputs:  one line from stdin, taken verbatim (read -r)
# Outputs: search results on stdout
#######################################
user_search() {
  # Counted per call; a value inherited from the environment or from an
  # earlier call must not suppress the grep fallback.
  local app_count=0
  local pcre_pattern

  printf '\n\n\n'
  read -rp "Enter regex (read as raw input): " search_term

  # Match only on lines that do not start with a URL; \K drops the skipped
  # prefix from the reported match. The user regex is wrapped in (?:...) so a
  # top-level alternation such as foo|bar stays inside that condition.
  pcre_pattern="^(?!https?:\/\/).*\K(?:${search_term})"

  if command -v ug >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    printf '%s\n' "Ugrep output"
    ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" --andnot "^https?:\/\/" <<<"$formated_transcript_file_CRLF"
    app_count=$((app_count + 1))
  fi

  if command -v rg >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    printf '%s\n' "Ripgrep output"
    rg -iP -B2 -A7 "${pcre_pattern}" <<<"$formated_transcript_file"
    app_count=$((app_count + 1))
  fi

  if [ "$app_count" -eq 0 ]; then
    printf '\n\n\n\n\n'
    printf '%s\n' "Grep output"
    # -e keeps a pattern that starts with '-' from being parsed as an option;
    # the shared pattern keeps the timestamp URLs themselves from matching.
    grep -iP -B2 -A1 -e "${pcre_pattern}" <<<"$formated_transcript_file"
    printf '\n\n\n'
    printf '%s\n' "Consider installing ripgrep and ugrep for better search"
    app_count=$((app_count + 1))
  fi
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"