sync.py
'''
Loads a list of podcasts from a file and downloads them to a directory.
Author: Mike Corey
'''
import os
import hashlib
import xml.etree.ElementTree as ET
from datetime import datetime
import requests
from tqdm import tqdm
SOURCES_FILE = os.getenv('LOADCAST_SOURCES_FILE') or './sources.txt'
MAX_EPISODES = int(os.getenv('LOADCAST_DOWNLOAD_MAX') or 5)
DOWNLOADED_FILES_DESTINATION = os.getenv('LOADCAST_DOWNLOAD_DIR') or './downloaded/'
PREV_DOWNLOADED_FILES_HASHES = f'{DOWNLOADED_FILES_DESTINATION}/prev_downloaded_files.dat'
HTTP_TIMEOUT = 60
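
# A minimal sketch of the expected sources file (assumption based on how
# main() reads it: one RSS feed URL per line; the URLs below are
# illustrative placeholders, not real feeds):
#
#   https://example.com/podcast-one/feed.xml
#   https://example.com/podcast-two/rss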


def hash_of_url(url):
    '''Returns the MD5 hash of the url'''
    return hashlib.md5(url.encode('utf-8')).hexdigest()


def load_an_rss_feed(url):
    '''Loads an RSS feed and returns a list of shows plus the channel title'''
    shows = []
    channel_name = None  # stays None if the feed cannot be fetched
    http_response = requests.get(url, timeout=HTTP_TIMEOUT)
    if http_response.status_code == 200:
        tree = ET.fromstring(http_response.content)
        channel_name = tree.find('channel/title').text
        for item in tree.iter('item'):
            title = item.find('title').text
            enclosure_url = item.find('enclosure').attrib['url']
            description = item.find('description').text
            pub_date_str = item.find('pubDate').text
            pub_date = datetime.strptime(pub_date_str, '%a, %d %b %Y %H:%M:%S %z')
            shows.append(
                {
                    'title': title,
                    'url': enclosure_url,
                    'description': description,
                    'pub_date': pub_date
                }
            )
    return shows, channel_name
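
# For reference, the feed shape this parser assumes (a sketch, not a real
# feed): each <item> needs <title>, <enclosure url="...">, <description>,
# and a pubDate in the '%a, %d %b %Y %H:%M:%S %z' format parsed above, e.g.:
#
#   <rss><channel>
#     <title>Example Podcast</title>
#     <item>
#       <title>Episode 1</title>
#       <enclosure url="https://example.com/ep1.mp3" type="audio/mpeg"/>
#       <description>First episode.</description>
#       <pubDate>Mon, 01 Jan 2024 08:00:00 +0000</pubDate>
#     </item>
#   </channel></rss>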


def check_if_file_exists(url):
    '''Checks if a file has been downloaded before'''
    if not os.path.exists(PREV_DOWNLOADED_FILES_HASHES):
        open(PREV_DOWNLOADED_FILES_HASHES, 'w', encoding='utf-8').close()
        return False
    with open(PREV_DOWNLOADED_FILES_HASHES, 'r', encoding='utf-8') as file:
        for line in file:
            if hash_of_url(url) in line:
                return True
    return False


def download_file(url, filename):
    '''Downloads a file and adds it to the list of downloaded files'''
    response = requests.get(url, timeout=HTTP_TIMEOUT)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        with open(PREV_DOWNLOADED_FILES_HASHES, 'a', encoding='utf-8') as file:
            file.write(f"{hash_of_url(url)}\n")
        return True
    return False
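
# The dedup ledger (prev_downloaded_files.dat) is just one MD5 hex digest per
# line, appended after each successful download; e.g. (illustrative values):
#
#   0cc175b9c0f1b6a831c399e269772661
#   92eb5ffee6ae2fec3ad71c777531578f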


def main():
    '''Entry point. Loads the list of podcast feeds and downloads recent episodes.'''
    podcasts = {}
    with open(SOURCES_FILE, 'r', encoding='utf-8') as file:
        for line in file:
            feed_url = line.strip()
            if not feed_url:
                continue  # skip blank lines in the sources file
            shows, channel_name = load_an_rss_feed(feed_url)
            if channel_name is None:
                print(f'Could not load feed: {feed_url}')
                continue
            podcasts[channel_name] = shows
    for podcast_name, podcast_data in podcasts.items():
        print(f'Syncing {podcast_name}')
        sub_directory = podcast_name.replace(' ', '_')[:64]
        os.makedirs(f'{DOWNLOADED_FILES_DESTINATION}{sub_directory}', exist_ok=True)
        ordered_podcasts = sorted(podcast_data, key=lambda x: x['pub_date'], reverse=True)
        for show in tqdm(ordered_podcasts[:MAX_EPISODES]):
            url = show['url']
            new_title = str(show['pub_date'].date()).replace(' ', '_')
            fqfn = f"{DOWNLOADED_FILES_DESTINATION}{sub_directory}/{new_title}.mp3"
            if not check_if_file_exists(url):
                download_file(url, fqfn)  # ID3 tag management removed


if __name__ == '__main__':
    main()
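
# Usage sketch (the environment variable names come from the constants above;
# the values shown are examples, not required settings):
#
#   LOADCAST_SOURCES_FILE=./sources.txt \
#   LOADCAST_DOWNLOAD_MAX=3 \
#   LOADCAST_DOWNLOAD_DIR=./downloaded/ \
#   python sync.py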