add weibo
tangrela committed Feb 4, 2018
1 parent 0d09fdd commit acc7053
Showing 7 changed files with 347 additions and 37 deletions.
16 changes: 6 additions & 10 deletions README.md
@@ -71,16 +71,12 @@ server {

-----

-2018-01-30 update: Tumblr video and image export is supported
+## 2018-01-30 update

-----

-PS: the core part of the source code is indeed encrypted; partial screenshots of that code can be posted here

-**恋恋影视 parsing section**

-![](1.png)
+Tumblr video and image export is supported

-**91porn parsing section**
+## 2018-02-04 update

-![](2.png)
+1. Added Weibo batch parsing
+2. Optimized the crawler
+3. Launched the v1.0 anti-crawler strategy
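The new Weibo batch parsing added in app/views.py (below) is triggered the same way as the Tumblr batch mode: the form field `url` is set to an `@nickname` value. A minimal sketch of exercising it with `requests` — the `/api` path and host are assumptions (the route decorator sits outside the visible hunk); the form field names come from the handler itself:

```python
# -*- coding: utf-8 -*-
# Sketch only, not part of the commit. Assumes the batch-parse view is mounted
# at /api on a local instance; the actual route is not shown in this diff.
import requests

BASE = "http://127.0.0.1:5000"           # assumed host/port
session = requests.Session()

session.get(BASE + "/")                   # index() stores a per-session hash server-side
resp = session.post(BASE + "/api", data={
    "url": u"@美少女写真馆",              # the '@' prefix routes the request to the Weibo branch
    "hash": "<hash_ value rendered into base.html>",  # must match what index() rendered
})
print(resp.json())                        # first call returns status=fail with a "wait ~15s" message
```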
15 changes: 15 additions & 0 deletions app/__init__.py
@@ -1,8 +1,23 @@
#-*- coding=utf-8 -*-
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_bootstrap import Bootstrap
#from celery import Celery,platforms
from flask_pagedown import PageDown
import logging
import datetime

# logging setup
logger = logging.getLogger("ojbk")
logger.setLevel(logging.DEBUG)
ch = logging.FileHandler("/root/ojbk_jiexi/logs/2mm_%(date)s.log" %
{'date': datetime.datetime.now().strftime('%Y-%m-%d')})
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)


app = Flask(__name__)
app.config.from_object('config')
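The new logger writes to a date-stamped file under a hard-coded `/root/ojbk_jiexi/logs/` directory, which must already exist or `logging.FileHandler` raises an `IOError`. A small sketch of a more portable variant that derives the log directory from the package location and creates it on first use — the `logs` directory name and `2mm_` filename prefix are kept from the original, the rest is an assumption:

```python
# Sketch only: a portable replacement for the hard-coded log path above.
import datetime
import logging
import os

log_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'logs')
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)                  # FileHandler does not create directories itself

log_name = '2mm_%s.log' % datetime.datetime.now().strftime('%Y-%m-%d')
handler = logging.FileHandler(os.path.join(log_dir, log_name))
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

logger = logging.getLogger('ojbk')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
```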
9 changes: 5 additions & 4 deletions app/models.py
@@ -22,15 +22,16 @@ def __repr__(self):

class Context(db.Model):
    __tablename__ = 'context_table'
-    id = db.Column(db.String(64), primary_key=True)
-    urls = db.Column(db.String(200), primary_key=True)
+    uid = db.Column(db.String(64), primary_key=True)
+    pid = db.Column(db.String(200), primary_key=True)
+    urls = db.Column(db.String(200))
    isvideo = db.Column(db.Integer, default=0)  # 0=no, 1=yes
    poster = db.Column(db.String(200))
    posttime = db.Column(db.DateTime())
-    description = db.Column(db.String(200))
+    description = db.Column(db.String(500))

    def __init__(self, **kwargs):
        super(Context, self).__init__(**kwargs)

    def __repr__(self):
-        return self.id
+        return self.uid
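Context rows are now keyed on (uid, pid) — the blog or Weibo account plus the individual post — instead of (id, urls), so a post is stored once even if its media URL changes. Note that SQLAlchemy only applies the new composite primary key to freshly created tables; an existing context_table would need to be migrated or recreated. A small sketch of the get-or-create pattern the crawlers use against the new key (column names are taken from the model above; this is an illustration, not code from the commit):

```python
# Sketch: inserting a post against the new (uid, pid) composite primary key.
from app import db
from app.models import Context

def save_post(uid, pid, url, isvideo, poster, posttime, desc):
    # Both columns carry primary_key=True, so filter_by(uid=..., pid=...)
    # identifies at most one row.
    row = Context.query.filter_by(uid=uid, pid=pid).first()
    if row is None:
        row = Context(uid=uid, pid=pid, urls=url, isvideo=isvideo,
                      poster=poster, posttime=posttime, description=desc)
        db.session.add(row)
        db.session.commit()
    return row
```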
10 changes: 10 additions & 0 deletions app/templates/base.html
@@ -102,6 +102,16 @@ <h2>视频&nbsp;<a href="/"><img src="/static/img/logo.png" alt="OJBK视频解
</div>
</td>
</tr>
<tr>
<th class="text-center">
<a class="left-title" href="javascript:void(0);">微博批量解析</a>
</th>
<td class="bs-callout bs-callout-info">
<div class="list-unstyled list-group up-list">
<a class="list-group-item get-link" href="#@美少女写真馆">@美少女写真馆</a>
</div>
</td>
</tr>
</table>
</div>
</div>
112 changes: 102 additions & 10 deletions app/views.py
@@ -19,11 +19,13 @@
from app import db
from app.models import Context
import parser
from . import logger
from config import *
from captcha import *

basedir = os.path.abspath('.')
clawer = os.path.join(basedir, 'tumblr_v2.py')
weibo_crawler = os.path.join(basedir, 'weibo.py')

#VIDEOREGEX = re.compile('http://media.tumblr.com/(.*?)_frame1')
VIDEOREGEX = re.compile(
@@ -33,7 +35,8 @@
'<meta property="og:image" content="(.*?)" /><meta property="og:image:height"')
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'
HOME = 'http://%s.tumblr.com/api/read?&num=50'

headers={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}
ban= ['TencentCloud','Savvis','ALICLOUD','GOOGLE-CLOUD']

def check(uid):
url = HOME % uid
@@ -50,6 +53,14 @@ def check(uid):
return False


def nickname_to_containerid(nickname):
url = "https://m.weibo.com/n/{}".format(nickname)
response = requests.get(url=url)
uid_check = re.search(r'(\d{16})', response.url)
if uid_check:
return "107603" + uid_check.group(1)[-10:]
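`nickname_to_containerid` resolves a display name by letting `m.weibo.com/n/<nickname>` redirect to the user's profile URL, pulls a run of 16 digits out of that final URL, and builds the mobile-API container id as the fixed prefix `107603` plus the last 10 digits (which the code treats as the numeric uid). A rough illustration of the transformation; the redirect URL below is hypothetical, since the exact shape of Weibo's redirect target is not shown in this commit:

```python
# Illustration only; the redirect URL is a made-up example.
import re

final_url = "https://m.weibo.cn/p/1005051234567890"   # what response.url might look like
match = re.search(r'(\d{16})', final_url)              # 16 digits: page prefix + 10-digit uid

containerid = None
if match:
    containerid = "107603" + match.group(1)[-10:]
print(containerid)                                     # -> 1076031234567890
```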


def getmd5():
a = md5()
letters = string.ascii_letters + string.digits
@@ -58,15 +69,56 @@ def getmd5():
return a.hexdigest()



def getipwhois(ip):
url='http://tool.chinaz.com/ipwhois?q={}'.format(ip)
try:
r=requests.get(url,headers=headers,timeout=8)
try:
netname=re.findall('netname:(.*?)<br/>',r.content)[0].replace(' ','')
except:
netname=re.findall('<p>Name : (.*?)</p>',r.content)[0].replace(' ','')
except Exception,e:
print e
netname='home'
return netname
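`getipwhois` scrapes the netname for the visitor's IP from tool.chinaz.com; `api()` below then rejects the request when that netname matches an entry in the new `ban` list of cloud/datacenter providers — presumably the "v1.0 anti-crawler strategy" from the README. The membership test there is written as `sum([...]) > 0`; a sketch of the same predicate with `any()`, which short-circuits and reads more directly (names reused from this file):

```python
# Sketch: equivalent of `sum([i.lower() in netname.lower() for i in ban]) > 0`.
ban = ['TencentCloud', 'Savvis', 'ALICLOUD', 'GOOGLE-CLOUD']

def is_banned_netname(netname):
    lowered = netname.lower()
    return any(entry.lower() in lowered for entry in ban)

print(is_banned_netname('ALICLOUD-CN'))   # True
print(is_banned_netname('home'))          # False
```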


@app.context_processor
def form_trans():
return dict(method='method')


@app.before_request
def before_request():
global ua
global ip
global netname
try:
ua = request.headers.get('User-Agent')
except:
ua = "null"
try:
ip = request.headers['X-Forwarded-For'].split(',')[0]
except:
ip = request.remote_addr
print ip
netname=getipwhois(ip)



def log(string):
global ip
global ua
global netname
logger.info('ip:{ip},netname:{netname},UA:{ua},action:{string}'.format(ip=ip,netname=netname,ua=ua,string=string))
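`before_request` stores `ua`, `ip` and `netname` in module-level globals that `log()` and `api()` read back. That works on a single-threaded dev server, but under a threaded or multi-worker deployment (the `X-Forwarded-For` handling above suggests a proxy in front) concurrent requests would overwrite each other's values. A sketch of the same bookkeeping on `flask.g`, which is per-request state; `app`, `logger` and `getipwhois` are assumed to be the objects already defined in this module:

```python
# Sketch: per-request state on flask.g instead of module-level globals.
from flask import g, request

@app.before_request
def before_request():
    g.ua = request.headers.get('User-Agent') or 'null'
    forwarded = request.headers.get('X-Forwarded-For', '')
    g.ip = forwarded.split(',')[0].strip() if forwarded else request.remote_addr
    g.netname = getipwhois(g.ip)

def log(action):
    logger.info('ip:{ip},netname:{netname},UA:{ua},action:{action}'.format(
        ip=g.ip, netname=g.netname, ua=g.ua, action=action))
```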


@app.route('/')
def index():
hash_ = getmd5()
session['hash'] = hash_
log('visit home page')
return render_template('base.html', hash_=hash_)


@@ -75,19 +127,23 @@ def api():
url = request.form.get('url')
hash_ = request.form.get('hash')
captcha_code = request.form.get('captcha_code')
if ip in ['111.231.237.241','111.230.109.198','91.121.83.61'] or sum([i.lower() in netname.lower() for i in ban])>0:
retdata={}
retdata['status'] = 'fail'
retdata['message'] = '机器人滚!如果不是机器人,请不要通过代理访问本站!'
return jsonify(retdata)
if captcha_code is not None:
log('verify captcha code')
print 'input code is :', captcha_code
print 'session code is :', session.get('CAPTCHA')
if captcha_code.upper() == session.get('CAPTCHA'):
return jsonify({'captcha': 'pass'})
if hash_ != session.get('hash'):
return jsonify({'captcha': 'ok'})
if hash_ is None:
return jsonify({'captcha': 'ok'})
if request.headers['User-Agent'] is None or 'python' in request.headers['User-Agent'].lower():
if hash_ != session.get('hash') or hash_ is None or request.headers['User-Agent'] is None or 'python' in request.headers['User-Agent'].lower():
log('may be a crawler!!! url {}'.format(url))
return jsonify({'captcha': 'ok'})
else:
retdata = {}
log('fetch url {}'.format(url))
# Tumblr single-post parsing
if 'tumblr.com/post' in url:
try:
@@ -120,7 +176,7 @@ def api():
retdata['message'] = '解析失败,请联系站长解决'
return jsonify(retdata)
# Tumblr batch parsing
if 'tumblr.com' in url:
elif 'tumblr.com' in url:
id = re.findall('://(.*?)\.', url)[0]
if check(id):
is_exists = ID.query.filter_by(id=id).first()
@@ -151,7 +207,7 @@ def api():
retdata['html'] += ' | <a href="/download?id={}&type=picture" class="btn btn-primary" role="button" title="导出图片">导出图片 <span class="glyphicon glyphicon-picture"></span></a>'.format(
id)
videos = Context.query.filter_by(
id=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
uid=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
for video in videos:
retdata.setdefault('video', []).append(
{'url': video.urls, 'desc': video.description, 'thumb': video.poster})
@@ -161,7 +217,42 @@ def api():
retdata['status'] = 'fail'
retdata['message'] = '解析失败,请联系站长解决'
return jsonify(retdata)
# Weibo batch parsing
elif url.startswith('@'):
id = nickname_to_containerid(url.replace('@', ''))
print 'weibo\'s containerid:{}'.format(id)
is_exists = ID.query.filter_by(id=id).first()
if is_exists is None:
now = datetime.now()
inserttime = now.strftime('%Y%m%d %H:%M:%S')
a = ID(id=id, updateTime=inserttime, parseTimes=1)
db.session.add(a)
db.session.commit()
retdata['status'] = 'fail'
retdata['message'] = '正在解析,请稍等15s再试!'
subprocess.Popen('python {clawer} {id}'.format(
clawer=weibo_crawler, id=id), shell=True)
return jsonify(retdata)
else:
now = datetime.now()
is_exists.updateTime = now.strftime('%Y%m%d %H:%M:%S')
is_exists.parseTimes += 1
db.session.add(is_exists)
db.session.commit()
subprocess.Popen('python {clawer} {id}'.format(
clawer=weibo_crawler, id=id), shell=True)
retdata['status'] = 'ok'
retdata['total'] = 50
retdata['pages'] = 2
retdata['html'] = '<a href="/download?id={}&type=video" class="btn btn-primary" role="button" title="导出视频">导出视频 <span class="glyphicon glyphicon-film"></span></a>'.format(
id)
retdata['html'] += ' | <a href="/download?id={}&type=picture" class="btn btn-primary" role="button" title="导出图片">导出图片 <span class="glyphicon glyphicon-picture"></span></a>'.format(
id)
videos = Context.query.filter_by(
uid=id, isvideo=1).order_by(Context.posttime.desc()).limit(50).all()
for video in videos:
retdata.setdefault('video', []).append(
{'url': video.urls, 'desc': video.description, 'thumb': video.poster})
return jsonify(retdata)
else:
try:
video, title, picture = parser.main(url)
@@ -182,11 +273,12 @@ def api():
def download():
id = request.args.get('id')
type = request.args.get('type')
log('download from {} {}'.format(id,type))
if type == 'video':
isvideo = 1
else:
isvideo = 0
query_result = Context.query.filter_by(id=id, isvideo=isvideo).all()
query_result = Context.query.filter_by(uid=id, isvideo=isvideo).order_by(Context.posttime.desc()).all()
if len(query_result) <> 0:
content = ''
for line in query_result:
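The new Weibo branch in `api()` mirrors the Tumblr batch flow: resolve `@nickname` to a containerid, record or update it in the `ID` table, spawn `weibo.py` in the background, and on later requests serve the latest 50 rows from `Context`. The crawler is launched with `subprocess.Popen('python {clawer} {id}'.format(...), shell=True)`; a sketch of the same launch with an argument list, which avoids the shell and any quoting issues (paths as defined at the top of views.py):

```python
# Sketch: launching the background crawler without shell=True.
import os
import subprocess
import sys

basedir = os.path.abspath('.')
weibo_crawler = os.path.join(basedir, 'weibo.py')

def start_weibo_crawler(containerid):
    # sys.executable keeps the crawler on the same interpreter as the web app;
    # passing an argv list means the containerid is never interpreted by a shell.
    return subprocess.Popen([sys.executable, weibo_crawler, containerid])
```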
26 changes: 13 additions & 13 deletions tumblr_v2.py
@@ -64,21 +64,22 @@ def parse_post(post):
global pic_links
posttime = time.localtime(post['unix-timestamp'])
desc = post['slug']
pid=post['id']
if post.has_key('video-player'):
videosource = post['video-player']
poster = re.findall("poster='(.*?)'", videosource)[0]
vid = re.findall(
'''poster='.*?[\w\W]*?/tumblr_(.*?)_.*?''', videosource)[0]
video = vhead.format(vid)
video_links.append((desc, posttime, poster, video))
video_links.append((pid,desc, posttime, poster, video))
if post.has_key('photo-caption'):
if len(post['photos']) == 0:
picture = post['photo-url-1280']
pic_links.append((desc, posttime, picture))
pic_links.append((pid,desc, posttime, picture))
else:
for pic in post['photos']:
picture = pic['photo-url-1280']
pic_links.append((desc, posttime, picture))
pic_links.append((pid,desc, posttime, picture))


def parse_page(url):
@@ -91,21 +92,20 @@ def parse_page(url):


def write(name):
videos = [(i[0], i[1], i[2], i[3].replace('/480', ''))
for i in video_links]
videos = video_links
pictures = pic_links
for url in videos:
desc, posttime, poster, video = url
data = Context.query.filter_by(id=name, urls=video).first()
if not data:
data = Context(id=name, urls=video, isvideo=1,
pid,desc, posttime, poster, video = url
data = Context.query.filter_by(uid=name, pid=pid).first()
if data is None:
data = Context(uid=name,pid=pid, urls=video, isvideo=1,
poster=poster, posttime=posttime, description=desc)
db.session.add(data)
for url in pictures:
desc, posttime, picture = url
dat = Context.query.filter_by(id=name, urls=picture).first()
if not dat:
data = Context(id=name, urls=picture, isvideo=0,
pid,desc, posttime, picture = url
dat = Context.query.filter_by(uid=name, pid=pid).first()
if dat is None:
data = Context(uid=name,pid=pid, urls=picture, isvideo=0,
poster=picture, posttime=posttime, description=desc)
db.session.add(data)
db.session.commit()
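One portability note on `parse_post` above: `dict.has_key()`, the bare `print` statements and `except Exception, e` show the repo targets Python 2. If these scripts are ever moved to Python 3, `has_key()` is gone and the `in` operator expresses the same check; a minimal sketch:

```python
# Sketch: `post.has_key('video-player')` (Python 2 only) vs the portable `in` check.
post = {'video-player': "<video ... poster='...'>", 'slug': 'demo', 'id': '123'}

if 'video-player' in post:            # works on both Python 2 and 3
    videosource = post['video-player']

if 'photo-caption' in post:           # same replacement for the photo branch
    pass
```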
(The diff for the seventh changed file — presumably the new weibo.py crawler referenced from views.py — did not load in this view.)
