From 46516fce8f82f06ac99a65e85b14fb195104278d Mon Sep 17 00:00:00 2001 From: Lawrence Angrave Date: Wed, 23 Oct 2024 16:00:00 -0500 Subject: [PATCH] Initial work on yt-dlp --- PythonRpcServer/.gitignore | 1 + PythonRpcServer/requirements.txt | 5 ++- PythonRpcServer/youtube.py | 75 ++++++++++++++++++++++---------- PythonRpcServer/youtube_test.py | 7 ++- pythonrpcserver.Dockerfile | 8 +++- 5 files changed, 67 insertions(+), 29 deletions(-) create mode 100644 PythonRpcServer/.gitignore diff --git a/PythonRpcServer/.gitignore b/PythonRpcServer/.gitignore new file mode 100644 index 00000000..f7275bbb --- /dev/null +++ b/PythonRpcServer/.gitignore @@ -0,0 +1 @@ +venv/ diff --git a/PythonRpcServer/requirements.txt b/PythonRpcServer/requirements.txt index a95812ba..8e8d7258 100644 --- a/PythonRpcServer/requirements.txt +++ b/PythonRpcServer/requirements.txt @@ -32,8 +32,9 @@ wcwidth==0.2.13 # Not versioned numpy -pytube # if not available, use the tar.gz package (see Dockerfile) - +# No longer maintained pytube # if not available, use the tar.gz package (see Dockerfile) +yt-dlp +#Always get latest # protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py # Likely need to coordinate updating the C# version too diff --git a/PythonRpcServer/youtube.py b/PythonRpcServer/youtube.py index 45841871..ca69b7d1 100644 --- a/PythonRpcServer/youtube.py +++ b/PythonRpcServer/youtube.py @@ -1,12 +1,17 @@ -from pytube.extract import playlist_id +# from pytube.extract import playlist_id + +# from yt_dlp import YoutubeDL +import yt_dlp + import requests -from utils import encode, decode, getRandomString, download_file +from utils import getRandomString import os import json from time import perf_counter +import datetime #from pytube import YouTube -import pytube +# import pytube from mediaprovider import MediaProvider, InvalidPlaylistInfoException @@ -42,7 +47,10 @@ def get_youtube_channel(self, identifier): 
print(f'get_youtube_channel({identifier})') url = YOUTUBE_CHANNEL_BASE_URL+ identifier - channel = pytube.Channel(url) + # Use yt_dlp to create a channel, + + channel = yt_dlp.Youtube(url).get_channel() + ## channel.playlist_id = channel.playlist_id.replace('UC', 'UU') playlist_id = channel.playlist_id #according to one StackOver and one test, channels-to-playlists can also be converted with string replace UCXXXX to UUXXXX @@ -53,26 +61,33 @@ def get_youtube_playlist(self, identifier): try: start_time = perf_counter() - url= YOUTUBE_PLAYLIST_BASE_URL+ identifier + url= YOUTUBE_PLAYLIST_BASE_URL + identifier print(f"get_youtube_playlist(identifier): {url}") - playlist = pytube.Playlist(url) - + + ydl_opts = { + 'quiet': True, + 'extract_flat': 'in_playlist', # Ensure we are extracting playlist entries + 'force_generic_extractor': True, + } medias = [] - for v in playlist.videos: - - published_at = v.publish_date.strftime('%Y/%m/%d') - media = { - #"channelTitle": channelTitle, - "channelId": v.channel_id, - "playlistId": identifier, - "title": v.title, - "description": v.description, - "publishedAt": published_at, - "videoUrl": v.watch_url, - "videoId": v.video_id, - "createdAt": published_at - } - medias.append(media) + # Current time in YYYYMMDD format + now = datetime.datetime.now().strftime('%Y%m%d') + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info_dict = ydl.extract_info(url, download=False) + for entry in info_dict.get( 'entries', []): + print(entry) + published_at = entry.get('upload_date', now) + media = { + "channelId": entry['channel_id'], + "playlistId": identifier, + "title": entry['title'], + "description": entry['description'], + "publishedAt": published_at, + "videoUrl": "https://youtube.com/watch?v="+entry['id'], + "videoId": entry['id'], + "createdAt": published_at + } + medias.append(media) end_time = perf_counter() print(f'Youtube playlist {identifier}: Returning {len(medias)} items. 
Processing time {end_time - start_time :.2f} seconds') return medias @@ -86,7 +101,21 @@ def download_youtube_video(self, youtubeUrl): start_time = perf_counter() extension = '.mp4' filename = getRandomString(8) - filepath = pytube.YouTube(youtubeUrl).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename) + filepath =f'{DATA_DIRECTORY}/{filename}' + ydl_opts = { + 'quiet': True, + 'format': 'best[ext=mp4]', + 'outtmpl': filepath, + 'cachedir' : False, + 'progress_hooks': [], + 'call_home': False, + 'no_color': True, + 'noprogress': True, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + x = ydl.download([youtubeUrl]) + print(x) + #filepath = yt_dlp.YoutubeDL(ydl_opts).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename) end_time = perf_counter() print(f"download_youtube_video({youtubeUrl}): Done. Downloaded in {end_time - start_time :.2f} seconds") return filepath, extension diff --git a/PythonRpcServer/youtube_test.py b/PythonRpcServer/youtube_test.py index a7ddf125..a838004d 100644 --- a/PythonRpcServer/youtube_test.py +++ b/PythonRpcServer/youtube_test.py @@ -5,7 +5,7 @@ import youtube -def test_youtube(): +def test_youtube1(): print("Test 1/2: Download playlist") yt=youtube.YoutubeProvider() pl=yt.get_youtube_playlist('PLBgxzZMu3GpPb35BDIU5eeopR4MhBOZw_') @@ -17,7 +17,9 @@ def test_youtube(): assert 'STAT 385' in pl[0]['title'] +def test_youtube2(): print("Test 2/2: Download video") + yt=youtube.YoutubeProvider() onevid = yt.download_youtube_video('https://youtube.com/watch?v=DqHMh8nqCPw') # 24-72 seconds typical print(onevid) assert len(onevid) == 2 @@ -34,4 +36,5 @@ def test_youtube(): print("All tests completed") if __name__ == "__main__": - test_youtube() + test_youtube1() + test_youtube2() diff --git a/pythonrpcserver.Dockerfile b/pythonrpcserver.Dockerfile index dc6061fe..c3e6498b 100644 --- a/pythonrpcserver.Dockerfile +++ 
b/pythonrpcserver.Dockerfile @@ -23,6 +23,11 @@ COPY --from=whisperbuild /whisper.cpp/models /PythonRpcServer/models WORKDIR /PythonRpcServer + # Don't copy any py files here, so that we don't need to re-run whisper + COPY ./PythonRpcServer/transcribe_hellohellohello.wav . + # The output of this whisper run is used when we set MOCK_RECOGNITION=MOCK for quick testing + RUN whisper -ojf -f transcribe_hellohellohello.wav + COPY ./PythonRpcServer/requirements.txt requirements.txt RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r requirements.txt @@ -31,8 +36,7 @@ RUN python -m grpc_tools.protoc -I . --python_out=./ --grpc_python_out=./ ct.proto COPY ./PythonRpcServer . -# The output of this file is used when we set MOCK_RECOGNITION=MOCK for quick testing - RUN whisper -ojf -f transcribe_hellohellohello.wav + CMD [ "nice", "-n", "18", "ionice", "-c", "2", "-n", "6", "python3", "-u", "/PythonRpcServer/server.py" ]