-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
43 changed files
with
15,813 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# Created by .ignore support plugin (hsz.mobi) | ||
### Python template | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
.static_storage/ | ||
.media/ | ||
local_settings.py | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
|
||
import argparse | ||
|
||
from DBGater.db_singleton_mongo import SynDevAdmin | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' | ||
|
||
|
||
if __name__ == '__main__':
    # Reset the 'Crawled' flag to False on the documents of one collection.
    # Documents that lack the flag receive it; documents whose flag is truthy
    # are flipped back to False. Documents already holding a falsy flag are
    # left untouched.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where the flag is added")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    for doc in col.find():
        if 'Crawled' not in doc or doc['Crawled']:
            # NOTE(review): assumes `col` is a PyMongo Collection - confirm.
            # Collection.update() is deprecated since PyMongo 3.0 and removed
            # in 4.0; update_one() is the single-document replacement and
            # matches the old default (multi=False, upsert=False).
            col.update_one({'_id': doc['_id']}, {'$set': {'Crawled': False}})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
|
||
import argparse | ||
|
||
from DBGater.db_singleton_mongo import SynDevAdmin | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' | ||
|
||
|
||
if __name__ == '__main__':
    # Copy every document from the input collection into the output
    # collection, renaming a few fields and dropping 'Scraped' on the way.
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", type=str, help="collection from where the docs will be inserted")
    cli.add_argument("-o", type=str, help='collection to where the docs will be inserted')
    options = cli.parse_args()

    admin_db = SynDevAdmin.db_access()
    admin_db.connect()
    source = admin_db.collection(options.i)
    destination = admin_db.collection(options.o)

    for record in source.find():
        # pop() reads and removes the old key in one step (rename).
        record['Published_Year'] = record.pop('Publish_Year')
        del record['Scraped']
        record['HTML_Crawled'] = record.pop('Paper_HTML_Scraped')
        # Strict comparison kept on purpose: the flag may hold non-boolean
        # truthy values that must not count as crawled - TODO confirm.
        # NOTE(review): nesting reconstructed from a de-indented source; the
        # HTML rename is assumed to be conditional and the insert
        # unconditional - confirm against the original file.
        if record['HTML_Crawled'] == True:
            record['Paper_HTML_content'] = record.pop('Paper_HTML')
        destination.insert_one(record)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
import argparse | ||
import json_lines | ||
from DBGater.db_singleton_mongo import SynDevAdmin | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' | ||
|
||
|
||
if __name__ == '__main__':
    # Read a JSON-lines file and insert each parsed record, one at a time,
    # into the collection named on the command line.
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", type=str, help='input file path')
    cli.add_argument("-c", type=str, help="collection name where the json line file is inserted")
    options = cli.parse_args()

    admin_db = SynDevAdmin.db_access()
    admin_db.connect()
    target = admin_db.collection(options.c)

    with open(options.i, 'r') as source:
        records = json_lines.reader(source)
        for record in records:
            target.insert_one(record)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
|
||
import argparse | ||
|
||
from DBGater.db_singleton_mongo import SynDevAdmin | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' | ||
|
||
|
||
if __name__ == '__main__':
    # Remove the 'Error_Msg' field from every document that is marked as
    # scraped but still carries an error message.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where error messages are removed from scraped items")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    for doc in col.find():
        if doc['Scraped'] and 'Error_Msg' in doc:
            # NOTE(review): assumes `col` is a PyMongo Collection - confirm.
            # Collection.update() is deprecated since PyMongo 3.0 and removed
            # in 4.0; update_one() is the single-document replacement.
            col.update_one({'_id': doc['_id']}, {'$unset': {'Error_Msg': ""}})
            print("Removed Error_Msg.")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import division | ||
from __future__ import absolute_import | ||
|
||
import argparse | ||
|
||
from DBGater.db_singleton_mongo import SynDevAdmin | ||
|
||
__author__ = 'Ziqin (Shaun) Rong' | ||
__maintainer__ = 'Ziqin (Shaun) Rong' | ||
__email__ = '[email protected]' | ||
|
||
|
||
def summarize_progress(docs):
    """Tally scraping progress over an iterable of documents.

    Each document must provide 'Paper_HTML_Scraped'; documents whose status
    is the string "Server Issue" must also provide 'Article_HTML_Link'.

    Returns a tuple ``(all_doc, scraped_doc, server_issue_links)``: the total
    number of documents, the number whose status is truthy and not
    "Server Issue", and the links of the documents flagged "Server Issue".
    """
    all_doc = 0
    scraped_doc = 0
    server_issue_links = []
    for doc in docs:
        status = doc['Paper_HTML_Scraped']
        # "Server Issue" is a non-empty (truthy) string, so it has to be
        # tested before the generic truthiness check; otherwise the branch is
        # unreachable and such documents are miscounted as scraped.
        if status == "Server Issue":
            server_issue_links.append(doc['Article_HTML_Link'])
        elif status:
            scraped_doc += 1
        all_doc += 1
    return all_doc, scraped_doc, server_issue_links


if __name__ == '__main__':
    # Report how many documents of a collection have had their HTML scraped.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where the progress is inspected")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    all_doc, scraped_doc, server_issue_links = summarize_progress(col.find())
    for link in server_issue_links:
        print("Doc {} not able to scrape due to server issue.".format(link))

    # An empty collection would otherwise raise ZeroDivisionError.
    percent = float(scraped_doc) / all_doc * 100 if all_doc else 0.0
    print("All Doc: {}, Scraped Doc: {}, Scraped {}%.".format(all_doc, scraped_doc, percent))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Define here the models for your scraped items | ||
# | ||
# See documentation in: | ||
# https://doc.scrapy.org/en/latest/topics/items.html | ||
|
||
import scrapy | ||
|
||
|
||
class BorgesItem(scrapy.Item):
    """Placeholder Scrapy item; no fields are declared yet.

    Add fields as class attributes, e.g. ``name = scrapy.Field()``.
    """
Oops, something went wrong.