Skip to content

Commit

Permalink
Initial work on yt-dlp
Browse files Browse the repository at this point in the history
  • Loading branch information
angrave committed Oct 23, 2024
1 parent 5cdc50e commit 46516fc
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 29 deletions.
1 change: 1 addition & 0 deletions PythonRpcServer/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
5 changes: 3 additions & 2 deletions PythonRpcServer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ wcwidth==0.2.13

# Not versioned
numpy
pytube # if not available, use the tar.gz package (see Dockerfile)

# No longer maintained pytube # if not available, use the tar.gz package (see Dockerfile)
yt-dlp
# yt-dlp is intentionally left unpinned so that the latest release is always installed

# protobuf version 3.18.3 causes NotImplementedError("To be implemented") in PythonRpcServer/mediaprovider.py
# Likely need to coordinate updating the C# version too
Expand Down
75 changes: 52 additions & 23 deletions PythonRpcServer/youtube.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from pytube.extract import playlist_id
# from pytube.extract import playlist_id

# from yt_dlp import YoutubeDL
import yt_dlp

import requests
from utils import encode, decode, getRandomString, download_file
from utils import getRandomString
import os
import json
from time import perf_counter
import datetime

#from pytube import YouTube
import pytube
# import pytube

from mediaprovider import MediaProvider, InvalidPlaylistInfoException

Expand Down Expand Up @@ -42,7 +47,10 @@ def get_youtube_channel(self, identifier):
print(f'get_youtube_channel({identifier})')

url = YOUTUBE_CHANNEL_BASE_URL+ identifier
channel = pytube.Channel(url)
# Use yt_dlp to create a channel,

channel = yt_dlp.Youtube(url).get_channel()
## channel.playlist_id = channel.playlist_id.replace('UC', 'UU')

playlist_id = channel.playlist_id
# According to one Stack Overflow answer and one test, a channel ID can also be converted to its playlist ID with a string replace of the UCXXXX prefix to UUXXXX
Expand All @@ -53,26 +61,33 @@ def get_youtube_playlist(self, identifier):
try:
start_time = perf_counter()

url= YOUTUBE_PLAYLIST_BASE_URL+ identifier
url= YOUTUBE_PLAYLIST_BASE_URL + identifier
print(f"get_youtube_playlist(identifier): {url}")
playlist = pytube.Playlist(url)


ydl_opts = {
'quiet': True,
'extract_flat': 'in_playlist', # Ensure we are extracting playlist entries
'force_generic_extractor': True,
}
medias = []
for v in playlist.videos:

published_at = v.publish_date.strftime('%Y/%m/%d')
media = {
#"channelTitle": channelTitle,
"channelId": v.channel_id,
"playlistId": identifier,
"title": v.title,
"description": v.description,
"publishedAt": published_at,
"videoUrl": v.watch_url,
"videoId": v.video_id,
"createdAt": published_at
}
medias.append(media)
# Current time in YYYYMMDD format
now = datetime.datetime.now().strftime('%Y%m%d')
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info_dict = ydl.extract_info(url, download=False)
for entry in info_dict.get( 'entries', []):
print(entry)
published_at = entry.get('upload_date', now)
media = {
"channelId": entry['channel_id'],
"playlistId": identifier,
"title": entry['title'],
"description": entry['description'],
"publishedAt": published_at,
"videoUrl": "https://youtube.com/watch?v="+entry['id'],
"videoId": entry['id'],
"createdAt": published_at
}
medias.append(media)
end_time = perf_counter()
print(f'Youtube playlist {identifier}: Returning {len(medias)} items. Processing time {end_time - start_time :.2f} seconds')
return medias
Expand All @@ -86,7 +101,21 @@ def download_youtube_video(self, youtubeUrl):
start_time = perf_counter()
extension = '.mp4'
filename = getRandomString(8)
filepath = pytube.YouTube(youtubeUrl).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
filepath =f'{DATA_DIRECTORY}/{filename}'
ydl_opts = {
'quiet': True,
'format': 'best[ext=mp4]',
'outtmpl': filepath,
'cachedir' : False,
'progress_hooks': [],
'call_home': False,
'no_color': True,
'noprogress': True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
x = ydl.download([youtubeUrl])
print(x)
#filepath = yt_dlp.YoutubeDL(ydl_opts).streams.filter(subtype='mp4').get_highest_resolution().download(output_path = DATA_DIRECTORY, filename = filename)
end_time = perf_counter()
print(f"download_youtube_video({youtubeUrl}): Done. Downloaded in {end_time - start_time :.2f} seconds")
return filepath, extension
Expand Down
7 changes: 5 additions & 2 deletions PythonRpcServer/youtube_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import youtube

def test_youtube():
def test_youtube1():
print("Test 1/2: Download playlist")
yt=youtube.YoutubeProvider()
pl=yt.get_youtube_playlist('PLBgxzZMu3GpPb35BDIU5eeopR4MhBOZw_')
Expand All @@ -17,7 +17,9 @@ def test_youtube():

assert 'STAT 385' in pl[0]['title']

def test_youtube2():
print("Test 2/2: Download video")
yt=youtube.YoutubeProvider()
onevid = yt.download_youtube_video('https://youtube.com/watch?v=DqHMh8nqCPw') # 24-72 seconds typical
print(onevid)
assert len(onevid) == 2
Expand All @@ -34,4 +36,5 @@ def test_youtube():
print("All tests completed")

if __name__ == "__main__":
test_youtube()
test_youtube1()
test_youtube2()
8 changes: 6 additions & 2 deletions pythonrpcserver.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
COPY --from=whisperbuild /whisper.cpp/models /PythonRpcServer/models
WORKDIR /PythonRpcServer

# Don't copy any py files here, so that we don't need to re-run whisper
COPY ./PythonRpcServer/transcribe_hellohellohello.wav .
# The output of this whisper run is used when we set MOCK_RECOGNITION=MOCK for quick testing
RUN whisper -ojf -f transcribe_hellohellohello.wav

COPY ./PythonRpcServer/requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
Expand All @@ -31,8 +36,7 @@
RUN python -m grpc_tools.protoc -I . --python_out=./ --grpc_python_out=./ ct.proto

COPY ./PythonRpcServer .
# The output of this file is used when we set MOCK_RECOGNITION=MOCK for quick testing
RUN whisper -ojf -f transcribe_hellohellohello.wav


CMD [ "nice", "-n", "18", "ionice", "-c", "2", "-n", "6", "python3", "-u", "/PythonRpcServer/server.py" ]

Expand Down

0 comments on commit 46516fc

Please sign in to comment.