Initial work on yt-dlp

classtranscribe · Oct 23, 2024 · 46516fc · 46516fc
1 parent 5cdc50e
commit 46516fc
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 29 deletions.
diff --git a/PythonRpcServer/.gitignore b/PythonRpcServer/.gitignore
@@ -0,0 +1 @@
+venv/
diff --git a/PythonRpcServer/requirements.txt b/PythonRpcServer/requirements.txt
@@ -32,8 +32,9 @@ wcwidth==0.2.13
 
 # Not versioned
 numpy
-pytube     # if not available, use the tar.gz package (see Dockerfile)
-
+# pytube is no longer maintained and has been replaced by yt-dlp (old note: if not available, use the tar.gz package; see Dockerfile)
+yt-dlp
+# yt-dlp is intentionally left unpinned so that the latest release is always installed
 
 # protobuf version 3.18.3 causes  NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
 # Likely need to coordinate updating the C# version too

diff --git a/PythonRpcServer/youtube.py b/PythonRpcServer/youtube.py
@@ -1,12 +1,17 @@
-from pytube.extract import playlist_id
+# from pytube.extract import playlist_id
+
+# from yt_dlp import YoutubeDL
+import yt_dlp
+
 import requests
-from utils import encode, decode, getRandomString, download_file
+from utils import getRandomString
 import os
 import json
 from time import perf_counter 
+import datetime
 
 #from pytube import YouTube
-import pytube
+# import pytube
 
 from mediaprovider import MediaProvider, InvalidPlaylistInfoException
 
@@ -42,7 +47,10 @@ def get_youtube_channel(self, identifier):
         print(f'get_youtube_channel({identifier})')
 
         url = YOUTUBE_CHANNEL_BASE_URL+ identifier
-        channel = pytube.Channel(url)
+        # Use yt_dlp to look up the channel. NOTE(review): confirm that yt_dlp actually provides Youtube(url).get_channel() - TODO verify
+
+        channel = yt_dlp.Youtube(url).get_channel()
+        ## channel.playlist_id = channel.playlist_id.replace('UC', 'UU')
 
         playlist_id = channel.playlist_id
         #according to one StackOver and one test, channels-to-playlists can also be converted with string replace  UCXXXX to UUXXXX
@@ -53,26 +61,33 @@ def get_youtube_playlist(self, identifier):
         try:
             start_time = perf_counter()
 
-            url= YOUTUBE_PLAYLIST_BASE_URL+ identifier
+            url= YOUTUBE_PLAYLIST_BASE_URL + identifier
             print(f"get_youtube_playlist(identifier): {url}")
-            playlist = pytube.Playlist(url)
-
+
+            ydl_opts = {
+                'quiet': True,
+                'extract_flat': 'in_playlist',  # Ensure we are extracting playlist entries
+                'force_generic_extractor': True,
+            }
             medias = []
-            for v in playlist.videos:
-
-                published_at = v.publish_date.strftime('%Y/%m/%d')
-                media = {
-                    #"channelTitle": channelTitle,
-                    "channelId": v.channel_id,
-                    "playlistId": identifier,
-                    "title": v.title,
-                    "description": v.description,
-                    "publishedAt": published_at,
-                    "videoUrl": v.watch_url,
-                    "videoId": v.video_id,
-                    "createdAt": published_at
-                }
-                medias.append(media)
+            # Current time in YYYYMMDD format
+            now = datetime.datetime.now().strftime('%Y%m%d')
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info_dict = ydl.extract_info(url, download=False)
+                for entry in info_dict.get( 'entries', []):
+                    print(entry)
+                    published_at = entry.get('upload_date', now)
+                    media = {
+                        "channelId": entry['channel_id'],
+                        "playlistId": identifier,
+                        "title": entry['title'],
+                        "description": entry['description'],
+                        "publishedAt": published_at,
+                        "videoUrl": "https://youtube.com/watch?v="+entry['id'],
+                        "videoId": entry['id'],
+                        "createdAt": published_at
+                    }
+                    medias.append(media)
             end_time = perf_counter()
             print(f'Youtube playlist {identifier}: Returning {len(medias)} items. Processing time {end_time - start_time :.2f} seconds')
             return medias
@@ -86,7 +101,21 @@ def download_youtube_video(self, youtubeUrl):
             start_time = perf_counter()
             extension = '.mp4'
             filename = getRandomString(8)
-            filepath = pytube.YouTube(youtubeUrl).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
+            filepath =f'{DATA_DIRECTORY}/{filename}'
+            ydl_opts = {
+                'quiet': True,
+                'format': 'best[ext=mp4]',
+                'outtmpl': filepath,
+                'cachedir' : False,
+                'progress_hooks': [],
+                'call_home': False,
+                'no_color': True,
+                'noprogress': True,
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                x = ydl.download([youtubeUrl])
+                print(x)
+                #filepath = yt_dlp.YoutubeDL(ydl_opts).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
             end_time = perf_counter()
             print(f"download_youtube_video({youtubeUrl}): Done. Downloaded in {end_time - start_time :.2f} seconds")
             return filepath, extension

diff --git a/PythonRpcServer/youtube_test.py b/PythonRpcServer/youtube_test.py
@@ -5,7 +5,7 @@
 
 import youtube
 
-def test_youtube():
+def test_youtube1():
     print("Test 1/2: Download playlist")
     yt=youtube.YoutubeProvider()
     pl=yt.get_youtube_playlist('PLBgxzZMu3GpPb35BDIU5eeopR4MhBOZw_')
@@ -17,7 +17,9 @@ def test_youtube():
 
     assert 'STAT 385' in pl[0]['title']
 
+def test_youtube2():
     print("Test 2/2: Download video")
+    yt=youtube.YoutubeProvider()
     onevid = yt.download_youtube_video('https://youtube.com/watch?v=DqHMh8nqCPw') # 24-72 seconds typical
     print(onevid)
     assert len(onevid) == 2
@@ -34,4 +36,5 @@ def test_youtube():
     print("All tests completed")
 
 if __name__ == "__main__":
-    test_youtube()
+    test_youtube1()
+    test_youtube2()
diff --git a/pythonrpcserver.Dockerfile b/pythonrpcserver.Dockerfile
@@ -23,6 +23,11 @@
     COPY --from=whisperbuild /whisper.cpp/models /PythonRpcServer/models
     WORKDIR /PythonRpcServer
 
+    # Don't copy any py files here, so that we don't need to re-run whisper
+    COPY ./PythonRpcServer/transcribe_hellohellohello.wav .
+    # The output of this whisper run is used when we set MOCK_RECOGNITION=MOCK for quick testing
+    RUN whisper -ojf -f transcribe_hellohellohello.wav
+
     COPY ./PythonRpcServer/requirements.txt requirements.txt
     RUN pip install --no-cache-dir --upgrade pip && \
         pip install --no-cache-dir -r requirements.txt
@@ -31,8 +36,7 @@
     RUN python -m grpc_tools.protoc -I . --python_out=./ --grpc_python_out=./ ct.proto
 
     COPY ./PythonRpcServer .
-# The output of this file is used when we set MOCK_RECOGNITION=MOCK for quick testing
-    RUN whisper -ojf -f transcribe_hellohellohello.wav
+
 
     CMD [ "nice", "-n", "18", "ionice", "-c", "2", "-n", "6", "python3", "-u", "/PythonRpcServer/server.py" ]