Skip to content

Commit

Permalink
modify tumblr crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
tangrela committed Feb 2, 2018
1 parent 7a8368c commit 0d09fdd
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 24 deletions.
40 changes: 22 additions & 18 deletions app/models.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,36 @@
#-*- coding=utf-8 -*-
from app import db

#Tumblr
# Tumblr


class ID(db.Model):
__tablename__='id_table'
id=db.Column(db.String(64),primary_key=True)
parseTimes=db.Column(db.Integer,default=0) #解析次数
updateTime=db.Column(db.String(64)) #最近更新时间
__tablename__ = 'id_table'
id = db.Column(db.String(64), primary_key=True)
parseTimes = db.Column(db.Integer, default=0) # 解析次数
updateTime = db.Column(db.String(64)) # 最近更新时间
postnum = db.Column(db.Integer)

def __init__(self,**kwargs):
super(ID,self).__init__(**kwargs)
def __init__(self, **kwargs):
super(ID, self).__init__(**kwargs)

def __repr__(self):
return self.id

#


class Context(db.Model):
__tablename__='context_table'
id=db.Column(db.String(64),primary_key=True)
urls=db.Column(db.String(200),primary_key=True)
isvideo=db.Column(db.Integer,default=0) #0=no,1=yes
poster=db.Column(db.String(200))

def __init__(self,id,urls,isvideo,poster):
self.id=id
self.urls=urls
self.isvideo=isvideo
self.poster=poster
__tablename__ = 'context_table'
id = db.Column(db.String(64), primary_key=True)
urls = db.Column(db.String(200), primary_key=True)
isvideo = db.Column(db.Integer, default=0) # 0=no,1=yes
poster = db.Column(db.String(200))
posttime = db.Column(db.DateTime())
description = db.Column(db.String(200))

def __init__(self, **kwargs):
super(Context, self).__init__(**kwargs)

def __repr__(self):
return self.id
8 changes: 5 additions & 3 deletions app/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from captcha import *

basedir = os.path.abspath('.')
clawer = os.path.join(basedir, 'tumblr.py')
clawer = os.path.join(basedir, 'tumblr_v2.py')

#VIDEOREGEX = re.compile('http://media.tumblr.com/(.*?)_frame1')
VIDEOREGEX = re.compile(
Expand Down Expand Up @@ -84,6 +84,8 @@ def api():
return jsonify({'captcha': 'ok'})
if hash_ is None:
return jsonify({'captcha': 'ok'})
if request.headers['User-Agent'] is None or 'python' in request.headers['User-Agent'].lower():
return jsonify({'captcha': 'ok'})
else:
retdata = {}
# tumblr单个视频解析
Expand Down Expand Up @@ -149,10 +151,10 @@ def api():
retdata['html'] += ' | <a href="/download?id={}&type=picture" class="btn btn-primary" role="button" title="导出图片">导出图片 <span class="glyphicon glyphicon-picture"></span></a>'.format(
id)
videos = Context.query.filter_by(
id=id, isvideo=1).limit(50).all()
id=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
for video in videos:
retdata.setdefault('video', []).append(
{'url': video.urls, 'desc': '', 'thumb': video.poster})
{'url': video.urls, 'desc': video.description, 'thumb': video.poster})
return jsonify(retdata)
else:
# flash('解析失败')
Expand Down
2 changes: 1 addition & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
basedir=os.path.abspath(os.path.dirname(__file__))

SECRET_KEY='SSDFDSFDFD'
#SQLALCHEMY_DATABASE_URI='mysql+pymysql://user:passwd@localhost/database'
#SQLALCHEMY_DATABASE_URI='mysql+pymysql://user:passwd@localhost/db'
SQLALCHEMY_DATABASE_URI='sqlite:///'+os.path.join(basedir,'data.sqlite')
SQLALCHEMY_TRACK_MODIFICATIONS=True
debug=True
Expand Down
Binary file removed data.sqlite
Binary file not shown.
4 changes: 2 additions & 2 deletions tumblr.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def write(name):
poster, video = url
data = Context.query.filter_by(id=name, urls=video).first()
if not data:
data = Context(name, video, 1, poster)
data = Context(id=name, urls=video, isvideo=1, poster=poster)
db.session.add(data)
else:
data = Context.query.filter_by(id=name, urls=video).first()
Expand All @@ -60,7 +60,7 @@ def write(name):
for url in list(set(pictures)):
dat = Context.query.filter_by(id=name, urls=url).first()
if not dat:
data = Context(name, url, 0, url)
data = Context(id=name, urls=url, isvideo=0, poster=url)
db.session.add(data)
else:
data = Context.query.filter_by(id=name, urls=url).first()
Expand Down
137 changes: 137 additions & 0 deletions tumblr_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# -*- coding=utf-8 -*-
import re
import os
import sys
from time import clock
import time
import json
import requests
import threading
from app import db
from app.models import Context, ID

# Matches the URL of the largest rendition of a picture: the text enclosed by
# '<photo-url max-width="1280">' and '</photo-url>'.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)',
    flags=re.S,
)
# Captures (poster URL, video id) from a video player snippet that carries a
# poster='...' attribute followed by a ".../tumblr_<id>" type="video/mp4" source.
extractvideore = re.compile(
    r'''poster='(.*?)'[\w\W]*?/tumblr_(.*?)" type="video/mp4"'''
)

# Module-level accumulators shared by the crawler functions below:
# parse_post() appends to video_links / pic_links, getpost() is handed
# query_urls to fill, and write() drains the two link lists into the database.
video_links = []
pic_links = []
query_urls = []

# Template for the direct .mp4 URL of a video, filled with the video id.
vhead = 'https://vt.tumblr.com/tumblr_{}.mp4'
# JSONP read endpoint; '%s' is the blog name and the page offset is appended
# after 'start='.
api_url = 'http://%s.tumblr.com/api/read/json?callback=tumblrBadge.listItems&num=50&start='


def getpost(uid, query_urls):
    """Queue the API page URLs that still need to be crawled for a blog.

    Fetches the blog's total post count, compares it with the count stored
    on its ``ID`` row (if any) and appends one JSONP page URL per 50 posts
    to ``query_urls``:

    * no ``ID`` row, or a row without ``postnum``: every page is queued;
    * stored ``postnum`` lower than the current total: only the pages that
      cover the newly published posts are queued;
    * otherwise nothing is queued.

    When an ``ID`` row exists and is out of date, its ``postnum`` is updated
    and committed.

    :param uid: blog name, used as the sub-domain of tumblr.com.
    :param query_urls: list that receives the page URLs (mutated in place).
    :raises IndexError: if the response carries no ``<posts ... total="">``
        element.
    """
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    r = requests.get(url)
    total = int(re.findall('<posts start="0" total="(.*?)">', r.content)[0])

    record = ID.query.filter_by(id=uid).first()
    if record is None:
        print(uid + ' : ' + str(total))
        pending = total
    elif record.postnum is None:
        print(uid + ' : ' + str(total) + ' get 2')
        pending = total
    elif record.postnum < total:
        print(uid + ' : ' + str(total) + ' renew')
        # The delta must be taken BEFORE postnum is overwritten below;
        # computing it afterwards always yields 0 and queues a single page
        # no matter how many posts were added.
        pending = total - record.postnum
    else:
        # Stored count is already current: nothing to crawl.
        return

    if record is not None:
        record.postnum = total
        db.session.add(record)
        db.session.commit()

    # The API serves 50 posts per page, addressed by the offset of the first
    # post; floor division keeps the arithmetic integral.
    page_base = api_url % uid
    for page in range(pending // 50 + 1):
        query_urls.append(page_base + str(page * 50))


def parse_post(post):
    """Extract video and picture links from one post of the JSON feed.

    Appends ``(description, posttime, poster, video_url)`` tuples to the
    module-level ``video_links`` list and ``(description, posttime,
    picture_url)`` tuples to ``pic_links``.

    ``posttime`` is a ``datetime`` because it ends up in the
    ``Context.posttime`` column, which is declared as ``db.DateTime()``.

    :param post: dict decoded from the feed; ``'unix-timestamp'`` and
        ``'slug'`` are always read, ``'video-player'``, ``'photo-caption'``,
        ``'photos'`` and ``'photo-url-1280'`` are read when relevant.
    """
    # Local import so this function does not depend on the file header.
    from datetime import datetime

    # Local time, like the time.localtime() call this replaces.
    posttime = datetime.fromtimestamp(post['unix-timestamp'])
    desc = post['slug']

    if 'video-player' in post:
        videosource = post['video-player']
        posters = re.findall(r"poster='(.*?)'", videosource)
        vids = re.findall(
            r'''poster='.*?[\w\W]*?/tumblr_(.*?)_.*?''', videosource)
        # Players without a poster / tumblr_ source yield no match; skip
        # them instead of raising IndexError and aborting the whole page.
        if posters and vids:
            video_links.append(
                (desc, posttime, posters[0], vhead.format(vids[0])))

    if 'photo-caption' in post:
        photos = post['photos']
        if not photos:
            # No photo list: the picture URL sits on the post itself.
            pic_links.append((desc, posttime, post['photo-url-1280']))
        else:
            for pic in photos:
                pic_links.append((desc, posttime, pic['photo-url-1280']))


def parse_page(url):
    """Download one JSONP page of posts and feed each post to parse_post().

    The response is expected to look like ``tumblrBadge.listItems({...});``.
    Only the leading callback name and the trailing ``);`` are removed, so
    any ``);`` occurring inside post content is left untouched. A response
    without the wrapper is parsed as plain JSON.

    :param url: page URL produced by getpost().
    :raises ValueError: if the payload is not valid JSON.
    """
    prefix = 'tumblrBadge.listItems('
    r = requests.get(url)
    payload = r.content.strip()
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
        # Drop the optional ';' and the closing ')' of the callback call.
        if payload.endswith(';'):
            payload = payload[:-1]
        if payload.endswith(')'):
            payload = payload[:-1]
    json_data = json.loads(payload)
    for post in json_data['posts']:
        parse_post(post)


def write(name):
    """Persist the collected video and picture links for blog ``name``.

    Reads the module-level ``video_links`` and ``pic_links`` lists and adds
    a ``Context`` row for every link that is not already stored under
    ``(name, url)``; everything is committed once at the end.

    :param name: blog name, stored as ``Context.id``.
    """
    for desc, posttime, poster, raw_video in video_links:
        # Remove any '/480' fragment from the video URL before storing it.
        video = raw_video.replace('/480', '')
        existing = Context.query.filter_by(id=name, urls=video).first()
        if not existing:
            db.session.add(Context(
                id=name, urls=video, isvideo=1,
                poster=poster, posttime=posttime, description=desc))

    for desc, posttime, picture in pic_links:
        existing = Context.query.filter_by(id=name, urls=picture).first()
        if not existing:
            # A picture serves as its own poster.
            db.session.add(Context(
                id=name, urls=picture, isvideo=0,
                poster=picture, posttime=posttime, description=desc))

    db.session.commit()


def TumblrGet(name):
    """Crawl blog ``name``: queue its pages, parse them, store the links.

    Calls getpost() to fill the module-level ``query_urls`` list, runs
    parse_page() for every queued URL in its own thread, waits for all
    threads, then calls write() and prints a summary.

    :param name: blog name (sub-domain of tumblr.com).
    """
    # Wall-clock time: the work is network-bound, and time.clock() reports
    # CPU time on Unix (and no longer exists in Python 3.8+).
    started = time.time()
    getpost(name, query_urls)
    # NOTE: the figure printed here is the number of queued page URLs.
    print('{} has {} posts'.format(name, len(query_urls)))

    threads = [threading.Thread(target=parse_page, args=(url,))
               for url in query_urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    write(name)
    print("%s parse complete, cose %.1fs" % (name, time.time() - started))
    print("pictures %d,videos %d" % (len(pic_links), len(video_links)))


if __name__ == '__main__':
    # Script entry point: the first command-line argument is the blog name;
    # surrounding whitespace is stripped before crawling.
    name = sys.argv[1].strip()
    TumblrGet(name)

0 comments on commit 0d09fdd

Please sign in to comment.