-
Notifications
You must be signed in to change notification settings - Fork 345
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
update-Tumblr Crawler impl by [tumblr apiv2] #35
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,6 @@ | |
import re | ||
import json | ||
|
||
|
||
# Setting timeout | ||
TIMEOUT = 10 | ||
|
||
|
@@ -20,11 +19,16 @@ | |
START = 0 | ||
|
||
# Numbers of photos/videos per page | ||
MEDIA_NUM = 50 | ||
MEDIA_NUM = 20 | ||
|
||
# Numbers of downloading threads concurrently | ||
THREADS = 10 | ||
|
||
# just a test apikey | ||
API_KEY = "lmvVU5ExdfFZPyGOv0gCknJ2r1UnQEIZTYAYoDhKrq7eJdCn2o" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why did you post a test API key here? It seems this api_key belongs to your account. If you want to make it publicly available, that's OK. We'd better provide a function. Some users have not registered with Tumblr and may not want to take the time to register; this is why I chose API v1 instead of v2. Anyway, providing v2 support will be a good enhancement. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I found the key via Google; it's not mine, just use it. |
||
|
||
# enum(posts,likes) | ||
POST_TYPE="likes" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be better to add some documentation here. For most common usage, |
||
|
||
def video_hd_match(): | ||
hd_pattern = re.compile(r'.*"hdUrl":("([^\s,]*)"|false),') | ||
|
@@ -36,6 +40,7 @@ def match(video_player): | |
return hd_match.group(2).replace('\\', '') | ||
except: | ||
return None | ||
|
||
return match | ||
|
||
|
||
|
@@ -49,6 +54,7 @@ def match(video_player): | |
return default_match.group(1) | ||
except: | ||
return None | ||
|
||
return match | ||
|
||
|
||
|
@@ -82,15 +88,16 @@ def _register_regex_match_rules(self): | |
def _handle_medium_url(self, medium_type, post): | ||
try: | ||
if medium_type == "photo": | ||
return post["photo-url"][0]["#text"] | ||
return post["url"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I didn't see any handling for |
||
|
||
if medium_type == "video": | ||
video_player = post["video-player"][1]["#text"] | ||
for regex_rule in self.regex_rules: | ||
matched_url = regex_rule(video_player) | ||
if matched_url is not None: | ||
return matched_url | ||
else: | ||
video_player = post["video_url"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have you referred to the API v2 docs? I did not see any There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be |
||
# for regex_rule in self.regex_rules: | ||
# matched_url = regex_rule(video_player) | ||
# if matched_url is not None: | ||
# return matched_url | ||
return video_player | ||
else: | ||
raise Exception | ||
except: | ||
raise TypeError("Unable to find the right url for downloading. " | ||
|
@@ -105,7 +112,6 @@ def _download(self, medium_type, medium_url, target_folder): | |
if not medium_name.startswith("tumblr"): | ||
medium_name = "_".join([medium_url.split("/")[-2], | ||
medium_name]) | ||
|
||
medium_name += ".mp4" | ||
|
||
file_path = os.path.join(target_folder, medium_name) | ||
|
@@ -115,7 +121,8 @@ def _download(self, medium_type, medium_url, target_folder): | |
retry_times = 0 | ||
while retry_times < RETRY: | ||
try: | ||
resp = requests.get(medium_url, | ||
headers = {'Connection': 'keep-alive'} | ||
resp = requests.get(medium_url, headers, | ||
stream=True, | ||
proxies=self.proxies, | ||
timeout=TIMEOUT) | ||
|
@@ -141,7 +148,6 @@ def _download(self, medium_type, medium_url, target_folder): | |
|
||
|
||
class CrawlerScheduler(object): | ||
|
||
def __init__(self, sites, proxies=None): | ||
self.sites = sites | ||
self.proxies = proxies | ||
|
@@ -151,8 +157,7 @@ def __init__(self, sites, proxies=None): | |
def scheduling(self): | ||
# create workers | ||
for x in range(THREADS): | ||
worker = DownloadWorker(self.queue, | ||
proxies=self.proxies) | ||
worker = DownloadWorker(self.queue,proxies=self.proxies) | ||
# Setting daemon to True will let the main thread exit | ||
# even though the workers are blocking | ||
worker.daemon = True | ||
|
@@ -163,7 +168,7 @@ def scheduling(self): | |
|
||
def download_media(self, site): | ||
self.download_photos(site) | ||
self.download_videos(site) | ||
# self.download_videos(site) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this function call commented out? |
||
|
||
def download_videos(self, site): | ||
self._download_media(site, "video", START) | ||
|
@@ -185,34 +190,76 @@ def _download_media(self, site, medium_type, start): | |
if not os.path.isdir(target_folder): | ||
os.mkdir(target_folder) | ||
|
||
base_url = "http://{0}.tumblr.com/api/read?type={1}&num={2}&start={3}" | ||
# liked posts: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be moved into two functions, such as |
||
if POST_TYPE=="likes": | ||
base_url = "http://api.tumblr.com/v2/blog/{0}/likes?api_key={2}&limit={3}&offset={4}" | ||
else: | ||
base_url = "http://api.tumblr.com/v2/blog/{0}/posts/{1}/?api_key={2}&limit={3}&offset={4}" | ||
start = START | ||
while True: | ||
media_url = base_url.format(site, medium_type, MEDIA_NUM, start) | ||
response = requests.get(media_url, | ||
proxies=self.proxies) | ||
media_url = base_url.format(site, medium_type, API_KEY, MEDIA_NUM, start) | ||
print("Downloading from %s.\n" % media_url) | ||
response = requests.get(media_url,proxies=self.proxies) | ||
if response.status_code == 404: | ||
print("Site %s does not exist" % site) | ||
break | ||
|
||
try: | ||
data = xmltodict.parse(response.content) | ||
posts = data["tumblr"]["posts"]["post"] | ||
for post in posts: | ||
try: | ||
# if post has photoset, walk into photoset for each photo | ||
photoset = post["photoset"]["photo"] | ||
for photo in photoset: | ||
self.queue.put((medium_type, photo, target_folder)) | ||
except: | ||
# select the largest resolution | ||
# usually in the first element | ||
self.queue.put((medium_type, post, target_folder)) | ||
try: | ||
if 200 == response.status_code: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For non-200 status codes, a warning or info message should be logged or printed. |
||
data = json.loads(response.content) | ||
except ValueError: | ||
print('JSON syntax error' + response.content) | ||
return | ||
|
||
if POST_TYPE=="likes": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. This should be moved into two functions that handle these cases. |
||
posts = data["response"]["liked_posts"] | ||
self.downloadTaskPreHandlerOfLikePost(medium_type, posts, target_folder) | ||
else: | ||
posts = data["response"]["posts"] | ||
self.downloadTaskPreHandlerOfNormalPost(medium_type, posts, target_folder) | ||
start += MEDIA_NUM | ||
except KeyError: | ||
break | ||
|
||
# download post | ||
def downloadTaskPreHandlerOfNormalPost(self, medium_type, posts, target_folder): | ||
if medium_type == "photo": | ||
for post in posts: | ||
try: | ||
# if post has photoset, walk into photoset for each photo | ||
photoset = post["photos"] | ||
for photo in photoset: | ||
self.queue.put((medium_type, photo["original_size"], target_folder)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems wrong. Please check the API v2 docs. |
||
except: | ||
# select the largest resolution | ||
# usually in the first element | ||
self.queue.put((medium_type, post, target_folder)) | ||
elif medium_type == "video": | ||
for post in posts: | ||
try: | ||
self.queue.put((medium_type, post, target_folder)) | ||
except: | ||
# select the largest resolution | ||
# usually in the first element | ||
self.queue.put((medium_type, post, target_folder)) | ||
|
||
|
||
# download like | ||
def downloadTaskPreHandlerOfLikePost(self, medium_type, posts, target_folder): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be refactored along with |
||
for post in posts: | ||
try: | ||
# if post has photoset, walk into photoset for each photo | ||
if post["type"] == "photo": | ||
photoset = post["photos"] | ||
for photo in photoset: | ||
self.queue.put((medium_type, photo["original_size"], target_folder)) | ||
elif post["type"] == "video": | ||
self.queue.put((medium_type, post, target_folder)) | ||
except: | ||
# select the largest resolution | ||
# usually in the first element | ||
self.queue.put((medium_type, post, target_folder)) | ||
def usage(): | ||
print("1. Please create file sites.txt under this same directory.\n" | ||
"2. In sites.txt, you can specify tumblr sites separated by " | ||
|
@@ -221,31 +268,31 @@ def usage(): | |
"Sample File Content:\nsite1,site2\n\n" | ||
"Or use command line options:\n\n" | ||
"Sample:\npython tumblr-photo-video-ripper.py site1,site2\n\n\n") | ||
print(u"未找到sites.txt文件,请创建.\n" | ||
u"请在文件中指定Tumblr站点名,并以 逗号/空格/tab/表格鍵/回车符 分割,支持多行.\n" | ||
u"保存文件并重试.\n\n" | ||
u"例子: site1,site2\n\n" | ||
u"或者直接使用命令行参数指定站点\n" | ||
u"例子: python tumblr-photo-video-ripper.py site1,site2") | ||
print(u"can not find sites.txt,please create it first.\n" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why change the Chinese text to English? |
||
u"set Tumblr site name,split with comma,backspace,tab or split by line.\n" | ||
u"save site.txt and retry.\n\n" | ||
u"examples: site1,site2\n\n" | ||
u"or use command to execute\n" | ||
u"demo: python tumblr-photo-video-ripper.py site1,site2") | ||
|
||
|
||
def illegal_json(): | ||
print("Illegal JSON format in file 'proxies.json'.\n" | ||
"Please refer to 'proxies_sample1.json' and 'proxies_sample2.json'.\n" | ||
"And go to http://jsonlint.com/ for validation.\n\n\n") | ||
print(u"文件proxies.json格式非法.\n" | ||
u"请参照示例文件'proxies_sample1.json'和'proxies_sample2.json'.\n" | ||
u"然后去 http://jsonlint.com/ 进行验证.") | ||
print(u"proxies.json format illegal.\n" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. Please don't translate this back to English. There is no need to print the same message in the same language twice. |
||
u"refer to 'proxies_sample1.json' and 'proxies_sample2.json'.\n" | ||
u"you can validate in http://jsonlint.com.") | ||
|
||
|
||
def parse_sites(filename): | ||
with open(filename, "r") as f: | ||
raw_sites = f.read().rstrip().lstrip() | ||
|
||
raw_sites = raw_sites.replace("\t", ",") \ | ||
.replace("\r", ",") \ | ||
.replace("\n", ",") \ | ||
.replace(" ", ",") | ||
.replace("\r", ",") \ | ||
.replace("\n", ",") \ | ||
.replace(" ", ",") | ||
raw_sites = raw_sites.split(",") | ||
|
||
sites = list() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why delete the content below? Any reason?