add weibo
tangrela committed Feb 4, 2018
1 parent 0d09fdd commit acc7053
Showing 7 changed files with 347 additions and 37 deletions.
16 changes: 6 additions & 10 deletions README.md
@@ -71,16 +71,12 @@ server {

-----

-2018-01-30 update: Tumblr video and image export is supported
+## 2018-01-30 update

-----

-PS: the core part of the source code is indeed encrypted; partial screenshots of that code can be posted here

-**恋恋影视 parsing section**

-![](1.png)
+Tumblr video and image export is supported

-**91porn parsing section**
+## 2018-02-04 update

-![](2.png)
+1. Added Weibo batch parsing
+2. Optimized the crawler
+3. Launched the v1.0 anti-crawler strategy
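The new Weibo batch parsing added in app/views.py (below) is triggered the same way as the Tumblr batch mode: the form field `url` is set to an `@nickname` value. A minimal sketch of exercising it with `requests` — the `/api` path and host are assumptions (the route decorator sits outside the visible hunk); the form field names come from the handler itself:

```python
# -*- coding: utf-8 -*-
# Sketch only, not part of the commit. Assumes the batch-parse view is mounted
# at /api on a local instance; the actual route is not shown in this diff.
import requests

BASE = "http://127.0.0.1:5000"           # assumed host/port
session = requests.Session()

session.get(BASE + "/")                   # index() stores a per-session hash server-side
resp = session.post(BASE + "/api", data={
    "url": u"@美少女写真馆",              # the '@' prefix routes the request to the Weibo branch
    "hash": "<hash_ value rendered into base.html>",  # must match what index() rendered
})
print(resp.json())                        # first call returns status=fail with a "wait ~15s" message
```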
15 changes: 15 additions & 0 deletions app/__init__.py
@@ -1,8 +1,23 @@
#-*- coding=utf-8 -*-
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_bootstrap import Bootstrap
#from celery import Celery,platforms
from flask_pagedown import PageDown
import logging
import datetime

# logging setup
logger = logging.getLogger("ojbk")
logger.setLevel(logging.DEBUG)
ch = logging.FileHandler("/root/ojbk_jiexi/logs/2mm_%(date)s.log" %
{'date': datetime.datetime.now().strftime('%Y-%m-%d')})
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)


app = Flask(__name__)
app.config.from_object('config')
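The new logger writes to a date-stamped file under a hard-coded `/root/ojbk_jiexi/logs/` directory, which must already exist or `logging.FileHandler` raises an `IOError`. A small sketch of a more portable variant that derives the log directory from the package location and creates it on first use — the `logs` directory name and `2mm_` filename prefix are kept from the original, the rest is an assumption:

```python
# Sketch only: a portable replacement for the hard-coded log path above.
import datetime
import logging
import os

log_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'logs')
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)                  # FileHandler does not create directories itself

log_name = '2mm_%s.log' % datetime.datetime.now().strftime('%Y-%m-%d')
handler = logging.FileHandler(os.path.join(log_dir, log_name))
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

logger = logging.getLogger('ojbk')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
```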
9 changes: 5 additions & 4 deletions app/models.py
@@ -22,15 +22,16 @@ def __repr__(self):

class Context(db.Model):
    __tablename__ = 'context_table'
-    id = db.Column(db.String(64), primary_key=True)
-    urls = db.Column(db.String(200), primary_key=True)
+    uid = db.Column(db.String(64), primary_key=True)
+    pid = db.Column(db.String(200), primary_key=True)
+    urls = db.Column(db.String(200))
    isvideo = db.Column(db.Integer, default=0)  # 0=no, 1=yes
    poster = db.Column(db.String(200))
    posttime = db.Column(db.DateTime())
-    description = db.Column(db.String(200))
+    description = db.Column(db.String(500))

    def __init__(self, **kwargs):
        super(Context, self).__init__(**kwargs)

    def __repr__(self):
-        return self.id
+        return self.uid
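Context rows are now keyed on (uid, pid) — the blog or Weibo account plus the individual post — instead of (id, urls), so a post is stored once even if its media URL changes. Note that SQLAlchemy only applies the new composite primary key to freshly created tables; an existing context_table would need to be migrated or recreated. A small sketch of the get-or-create pattern the crawlers use against the new key (column names are taken from the model above; this is an illustration, not code from the commit):

```python
# Sketch: inserting a post against the new (uid, pid) composite primary key.
from app import db
from app.models import Context

def save_post(uid, pid, url, isvideo, poster, posttime, desc):
    # Both columns carry primary_key=True, so filter_by(uid=..., pid=...)
    # identifies at most one row.
    row = Context.query.filter_by(uid=uid, pid=pid).first()
    if row is None:
        row = Context(uid=uid, pid=pid, urls=url, isvideo=isvideo,
                      poster=poster, posttime=posttime, description=desc)
        db.session.add(row)
        db.session.commit()
    return row
```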
10 changes: 10 additions & 0 deletions app/templates/base.html
@@ -102,6 +102,16 @@ <h2>视频&nbsp;<a href="/"><img src="/static/img/logo.png" alt="OJBK视频解
</div>
</td>
</tr>
<tr>
<th class="text-center">
<a class="left-title" href="javascript:void(0);">微博批量解析</a>
</th>
<td class="bs-callout bs-callout-info">
<div class="list-unstyled list-group up-list">
<a class="list-group-item get-link" href="#@美少女写真馆">@美少女写真馆</a>
</div>
</td>
</tr>
</table>
</div>
</div>
112 changes: 102 additions & 10 deletions app/views.py
@@ -19,11 +19,13 @@
from app import db
from app.models import Context
import parser
from . import logger
from config import *
from captcha import *

basedir = os.path.abspath('.')
clawer = os.path.join(basedir, 'tumblr_v2.py')
weibo_crawler = os.path.join(basedir, 'weibo.py')

#VIDEOREGEX = re.compile('http://media.tumblr.com/(.*?)_frame1')
VIDEOREGEX = re.compile(
@@ -33,7 +35,8 @@
'<meta property="og:image" content="(.*?)" /><meta property="og:image:height"')
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'
HOME = 'http://%s.tumblr.com/api/read?&num=50'

headers={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
ban= ['TencentCloud','Savvis','ALICLOUD','GOOGLE-CLOUD']

def check(uid):
url = HOME % uid
@@ -50,6 +53,14 @@ def check(uid):
return False


def nickname_to_containerid(nickname):
url = "https://m.weibo.com/n/{}".format(nickname)
response = requests.get(url=url)
uid_check = re.search(r'(\d{16})', response.url)
if uid_check:
return "107603" + uid_check.group(1)[-10:]
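`nickname_to_containerid` resolves a display name by letting `m.weibo.com/n/<nickname>` redirect to the user's profile URL, pulls a run of 16 digits out of that final URL, and builds the mobile-API container id as the fixed prefix `107603` plus the last 10 digits (which the code treats as the numeric uid). A rough illustration of the transformation; the redirect URL below is hypothetical, since the exact shape of Weibo's redirect target is not shown in this commit:

```python
# Illustration only; the redirect URL is a made-up example.
import re

final_url = "https://m.weibo.cn/p/1005051234567890"   # what response.url might look like
match = re.search(r'(\d{16})', final_url)              # 16 digits: page prefix + 10-digit uid

containerid = None
if match:
    containerid = "107603" + match.group(1)[-10:]
print(containerid)                                     # -> 1076031234567890
```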


def getmd5():
a = md5()
letters = string.ascii_letters + string.digits
@@ -58,15 +69,56 @@ def getmd5():
return a.hexdigest()



def getipwhois(ip):
url='http://tool.chinaz.com/ipwhois?q={}'.format(ip)
try:
r=requests.get(url,headers=headers,timeout=8)
try:
netname=re.findall('netname:(.*?)<br/>',r.content)[0].replace(' ','')
except:
netname=re.findall('<p>Name : (.*?)</p>',r.content)[0].replace(' ','')
except Exception,e:
print e
netname='home'
return netname
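`getipwhois` scrapes the netname for the visitor's IP from tool.chinaz.com; `api()` below then rejects the request when that netname matches an entry in the new `ban` list of cloud/datacenter providers — presumably the "v1.0 anti-crawler strategy" from the README. The membership test there is written as `sum([...]) > 0`; a sketch of the same predicate with `any()`, which short-circuits and reads more directly (names reused from this file):

```python
# Sketch: equivalent of `sum([i.lower() in netname.lower() for i in ban]) > 0`.
ban = ['TencentCloud', 'Savvis', 'ALICLOUD', 'GOOGLE-CLOUD']

def is_banned_netname(netname):
    lowered = netname.lower()
    return any(entry.lower() in lowered for entry in ban)

print(is_banned_netname('ALICLOUD-CN'))   # True
print(is_banned_netname('home'))          # False
```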


@app.context_processor
def form_trans():
return dict(method='method')


@app.before_request
def before_request():
global ua
global ip
global netname
try:
ua = request.headers.get('User-Agent')
except:
ua = "null"
try:
ip = request.headers['X-Forwarded-For'].split(',')[0]
except:
ip = request.remote_addr
print ip
netname=getipwhois(ip)



def log(string):
global ip
global ua
global netname
logger.info('ip:{ip},netname:{netname},UA:{ua},action:{string}'.format(ip=ip,netname=netname,ua=ua,string=string))
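`before_request` stores `ua`, `ip` and `netname` in module-level globals that `log()` and `api()` read back. That works on a single-threaded dev server, but under a threaded or multi-worker deployment (the `X-Forwarded-For` handling above suggests a proxy in front) concurrent requests would overwrite each other's values. A sketch of the same bookkeeping on `flask.g`, which is per-request state; `app`, `logger` and `getipwhois` are assumed to be the objects already defined in this module:

```python
# Sketch: per-request state on flask.g instead of module-level globals.
from flask import g, request

@app.before_request
def before_request():
    g.ua = request.headers.get('User-Agent') or 'null'
    forwarded = request.headers.get('X-Forwarded-For', '')
    g.ip = forwarded.split(',')[0].strip() if forwarded else request.remote_addr
    g.netname = getipwhois(g.ip)

def log(action):
    logger.info('ip:{ip},netname:{netname},UA:{ua},action:{action}'.format(
        ip=g.ip, netname=g.netname, ua=g.ua, action=action))
```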


@app.route('/')
def index():
hash_ = getmd5()
session['hash'] = hash_
log('visit home page')
return render_template('base.html', hash_=hash_)


@@ -75,19 +127,23 @@ def api():
url = request.form.get('url')
hash_ = request.form.get('hash')
captcha_code = request.form.get('captcha_code')
if ip in ['111.231.237.241','111.230.109.198','91.121.83.61'] or sum([i.lower() in netname.lower() for i in ban])>0:
retdata={}
retdata['status'] = 'fail'
retdata['message'] = '机器人滚!如果不是机器人,请不要通过代理访问本站!'
return jsonify(retdata)
if captcha_code is not None:
log('verify captcha code')
print 'input code is :', captcha_code
print 'session code is :', session.get('CAPTCHA')
if captcha_code.upper() == session.get('CAPTCHA'):
return jsonify({'captcha': 'pass'})
if hash_ != session.get('hash'):
return jsonify({'captcha': 'ok'})
if hash_ is None:
return jsonify({'captcha': 'ok'})
if request.headers['User-Agent'] is None or 'python' in request.headers['User-Agent'].lower():
if hash_ != session.get('hash') or hash_ is None or request.headers['User-Agent'] is None or 'python' in request.headers['User-Agent'].lower():
log('may be a crawler!!! url {}'.format(url))
return jsonify({'captcha': 'ok'})
else:
retdata = {}
log('fetch url {}'.format(url))
# Tumblr single-post parsing
if 'tumblr.com/post' in url:
try:
@@ -120,7 +176,7 @@ def api():
retdata['message'] = '解析失败,请联系站长解决'
return jsonify(retdata)
# Tumblr batch parsing
if 'tumblr.com' in url:
elif 'tumblr.com' in url:
id = re.findall('://(.*?)\.', url)[0]
if check(id):
is_exists = ID.query.filter_by(id=id).first()
@@ -151,7 +207,7 @@ def api():
retdata['html'] += ' | <a href="/download?id={}&type=picture" class="btn btn-primary" role="button" title="导出图片">导出图片 <span class="glyphicon glyphicon-picture"></span></a>'.format(
id)
videos = Context.query.filter_by(
id=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
uid=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
for video in videos:
retdata.setdefault('video', []).append(
{'url': video.urls, 'desc': video.description, 'thumb': video.poster})
@@ -161,7 +217,42 @@ def api():
retdata['status'] = 'fail'
retdata['message'] = '解析失败,请联系站长解决'
return jsonify(retdata)
# Weibo batch parsing
elif url.startswith('@'):
id = nickname_to_containerid(url.replace('@', ''))
print 'weibo\'s containerid:{}'.format(id)
is_exists = ID.query.filter_by(id=id).first()
if is_exists is None:
now = datetime.now()
inserttime = now.strftime('%Y%m%d %H:%M:%S')
a = ID(id=id, updateTime=inserttime, parseTimes=1)
db.session.add(a)
db.session.commit()
retdata['status'] = 'fail'
retdata['message'] = '正在解析,请稍等15s再试!'
subprocess.Popen('python {clawer} {id}'.format(
clawer=weibo_crawler, id=id), shell=True)
return jsonify(retdata)
else:
now = datetime.now()
is_exists.updateTime = now.strftime('%Y%m%d %H:%M:%S')
is_exists.parseTimes += 1
db.session.add(is_exists)
db.session.commit()
subprocess.Popen('python {clawer} {id}'.format(
clawer=weibo_crawler, id=id), shell=True)
retdata['status'] = 'ok'
retdata['total'] = 50
retdata['pages'] = 2
retdata['html'] = '<a href="/download?id={}&type=video" class="btn btn-primary" role="button" title="导出视频">导出视频 <span class="glyphicon glyphicon-film"></span></a>'.format(
id)
retdata['html'] += ' | <a href="/download?id={}&type=picture" class="btn btn-primary" role="button" title="导出图片">导出图片 <span class="glyphicon glyphicon-picture"></span></a>'.format(
id)
videos = Context.query.filter_by(
uid=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
for video in videos:
retdata.setdefault('video', []).append(
{'url': video.urls, 'desc': video.description, 'thumb': video.poster})
return jsonify(retdata)
else:
try:
video, title, picture = parser.main(url)
@@ -182,11 +273,12 @@ def api():
def download():
id = request.args.get('id')
type = request.args.get('type')
log('download from {} {}'.format(id,type))
if type == 'video':
isvideo = 1
else:
isvideo = 0
query_result = Context.query.filter_by(id=id, isvideo=isvideo).all()
query_result = Context.query.filter_by(uid=id, isvideo=isvideo).order_by(Context.posttime.desc()).all()
if len(query_result) <> 0:
content = ''
for line in query_result:
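The new Weibo branch in `api()` mirrors the Tumblr batch flow: resolve `@nickname` to a containerid, record or update it in the `ID` table, spawn `weibo.py` in the background, and on later requests serve the latest 50 rows from `Context`. The crawler is launched with `subprocess.Popen('python {clawer} {id}'.format(...), shell=True)`; a sketch of the same launch with an argument list, which avoids the shell and any quoting issues (paths as defined at the top of views.py):

```python
# Sketch: launching the background crawler without shell=True.
import os
import subprocess
import sys

basedir = os.path.abspath('.')
weibo_crawler = os.path.join(basedir, 'weibo.py')

def start_weibo_crawler(containerid):
    # sys.executable keeps the crawler on the same interpreter as the web app;
    # passing an argv list means the containerid is never interpreted by a shell.
    return subprocess.Popen([sys.executable, weibo_crawler, containerid])
```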
26 changes: 13 additions & 13 deletions tumblr_v2.py
@@ -64,21 +64,22 @@ def parse_post(post):
global pic_links
posttime = time.localtime(post['unix-timestamp'])
desc = post['slug']
pid=post['id']
if post.has_key('video-player'):
videosource = post['video-player']
poster = re.findall("poster='(.*?)'", videosource)[0]
vid = re.findall(
'''poster='.*?[\w\W]*?/tumblr_(.*?)_.*?''', videosource)[0]
video = vhead.format(vid)
video_links.append((desc, posttime, poster, video))
video_links.append((pid,desc, posttime, poster, video))
if post.has_key('photo-caption'):
if len(post['photos']) == 0:
picture = post['photo-url-1280']
pic_links.append((desc, posttime, picture))
pic_links.append((pid,desc, posttime, picture))
else:
for pic in post['photos']:
picture = pic['photo-url-1280']
pic_links.append((desc, posttime, picture))
pic_links.append((pid,desc, posttime, picture))


def parse_page(url):
@@ -91,21 +92,20 @@ def parse_page(url):


def write(name):
videos = [(i[0], i[1], i[2], i[3].replace('/480', ''))
for i in video_links]
videos = video_links
pictures = pic_links
for url in videos:
desc, posttime, poster, video = url
data = Context.query.filter_by(id=name, urls=video).first()
if not data:
data = Context(id=name, urls=video, isvideo=1,
pid,desc, posttime, poster, video = url
data = Context.query.filter_by(uid=name, pid=pid).first()
if data is None:
data = Context(uid=name,pid=pid, urls=video, isvideo=1,
poster=poster, posttime=posttime, description=desc)
db.session.add(data)
for url in pictures:
desc, posttime, picture = url
dat = Context.query.filter_by(id=name, urls=picture).first()
if not dat:
data = Context(id=name, urls=picture, isvideo=0,
pid,desc, posttime, picture = url
dat = Context.query.filter_by(uid=name, pid=pid).first()
if dat is None:
data = Context(uid=name,pid=pid, urls=picture, isvideo=0,
poster=picture, posttime=posttime, description=desc)
db.session.add(data)
db.session.commit()
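One portability note on `parse_post` above: `dict.has_key()`, the bare `print` statements and `except Exception, e` show the repo targets Python 2. If these scripts are ever moved to Python 3, `has_key()` is gone and the `in` operator expresses the same check; a minimal sketch:

```python
# Sketch: `post.has_key('video-player')` (Python 2 only) vs the portable `in` check.
post = {'video-player': "<video ... poster='...'>", 'slug': 'demo', 'id': '123'}

if 'video-player' in post:            # works on both Python 2 and 3
    videosource = post['video-player']

if 'photo-caption' in post:           # same replacement for the photo branch
    pass
```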
(The diff for the seventh changed file — presumably the new weibo.py crawler referenced from views.py — did not load in this view.)
