Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunrong committed Jun 13, 2019
1 parent 16b6813 commit 14db7e7
Show file tree
Hide file tree
Showing 43 changed files with 15,813 additions and 0 deletions.
107 changes: 107 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

9 changes: 9 additions & 0 deletions Borges/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'
5 changes: 5 additions & 0 deletions Borges/db_scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'
27 changes: 27 additions & 0 deletions Borges/db_scripts/add_paper_scraped_flag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse

from DBGater.db_singleton_mongo import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'


if __name__ == '__main__':
    # Script: reset the ``Crawled`` flag to False on the documents of the
    # collection named by ``-c``.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where the flag is added")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    for doc in col.find():
        # A missing flag and a truthy flag are both rewritten to False;
        # documents whose flag is already falsy are left untouched.
        if doc.get('Crawled', True):
            # ``Collection.update`` was deprecated in PyMongo 3.0 and removed
            # in 4.0; ``update_one`` is its single-document replacement.
            # NOTE(review): assumes ``col`` is a PyMongo collection -- confirm
            # against DBGater.
            col.update_one({'_id': doc['_id']}, {'$set': {'Crawled': False}})
36 changes: 36 additions & 0 deletions Borges/db_scripts/aggregate_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse

from DBGater.db_singleton_mongo import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'


if __name__ == '__main__':
    # Script: copy every document from the ``-i`` collection into the ``-o``
    # collection, renaming a few fields on the way.
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", type=str, help="collection from where the docs will be inserted")
    cli.add_argument("-o", type=str, help='collection to where the docs will be inserted')
    options = cli.parse_args()

    database = SynDevAdmin.db_access()
    database.connect()
    source_col = database.collection(options.i)
    target_col = database.collection(options.o)

    for record in source_col.find():
        # Publish_Year -> Published_Year
        record['Published_Year'] = record.pop('Publish_Year')
        # The 'Scraped' field is dropped and not carried over.
        del record['Scraped']
        # Paper_HTML_Scraped -> HTML_Crawled
        record['HTML_Crawled'] = record.pop('Paper_HTML_Scraped')
        # Explicit comparison with True is kept on purpose: a truthy
        # non-boolean value must not be treated as crawled.
        if record['HTML_Crawled'] == True:  # noqa: E712
            # Paper_HTML -> Paper_HTML_content
            # NOTE(review): the reviewed copy of this script had lost its
            # indentation; the removal of 'Paper_HTML' is assumed to belong
            # to this branch -- confirm.
            record['Paper_HTML_content'] = record.pop('Paper_HTML')
        target_col.insert_one(record)
27 changes: 27 additions & 0 deletions Borges/db_scripts/insert_jl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import argparse
import json_lines
from DBGater.db_singleton_mongo import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'


if __name__ == '__main__':
    # Script: insert every record of a JSON Lines file (``-i``) into the
    # collection named by ``-c``, one document per record.
    cli = argparse.ArgumentParser()
    cli.add_argument("-i", type=str, help='input file path')
    cli.add_argument("-c", type=str, help="collection name where the json line file is inserted")
    options = cli.parse_args()

    database = SynDevAdmin.db_access()
    database.connect()
    target_col = database.collection(options.c)

    with open(options.i, 'r') as source:
        records = json_lines.reader(source)
        for record in records:
            target_col.insert_one(record)
29 changes: 29 additions & 0 deletions Borges/db_scripts/remove_scraped_error_msg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse

from DBGater.db_singleton_mongo import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'


if __name__ == '__main__':
    # Script: remove the stale ``Error_Msg`` field from documents of the
    # ``-c`` collection that are marked as scraped.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where error messages are removed from scraped items")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    for doc in col.find():
        # ``.get`` so that a document without a 'Scraped' field is skipped
        # instead of aborting the whole run with a KeyError.
        if doc.get('Scraped') and 'Error_Msg' in doc:
            # ``Collection.update`` was deprecated in PyMongo 3.0 and removed
            # in 4.0; ``update_one`` is its single-document replacement.
            # NOTE(review): assumes ``col`` is a PyMongo collection -- confirm
            # against DBGater.
            col.update_one({'_id': doc['_id']}, {'$unset': {'Error_Msg': ""}})
            print("Removed Error_Msg.")

34 changes: 34 additions & 0 deletions Borges/db_scripts/scraped_progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse

from DBGater.db_singleton_mongo import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '[email protected]'


def count_scraped(docs):
    """Tally scraping progress over an iterable of paper documents.

    Each document must provide a ``'Paper_HTML_Scraped'`` entry. A document
    whose entry equals the string ``"Server Issue"`` is reported separately
    and is NOT counted as scraped; any other truthy entry counts as scraped.

    :param docs: iterable of dict-like documents.
    :return: tuple ``(all_doc, scraped_doc, server_issue_links)`` where
        ``server_issue_links`` lists the ``'Article_HTML_Link'`` of every
        document flagged with ``"Server Issue"``, in iteration order.
    :raises KeyError: if a document lacks ``'Paper_HTML_Scraped'``, or a
        server-issue document lacks ``'Article_HTML_Link'``.
    """
    all_doc = 0
    scraped_doc = 0
    server_issue_links = []
    for doc in docs:
        all_doc += 1
        status = doc['Paper_HTML_Scraped']
        # The string check must come first: "Server Issue" is itself truthy,
        # so testing truthiness first would count it as scraped and make this
        # branch unreachable.
        if status == "Server Issue":
            server_issue_links.append(doc['Article_HTML_Link'])
        elif status:
            scraped_doc += 1
    return all_doc, scraped_doc, server_issue_links


if __name__ == '__main__':
    # Script: print how many documents of the ``-c`` collection have had
    # their paper HTML scraped.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, help="collection name where the progress is inspected")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection(args.c)

    all_doc, scraped_doc, server_issue_links = count_scraped(col.find())
    for link in server_issue_links:
        print("Doc {} not able to scrape due to server issue.".format(link))

    # An empty collection would otherwise raise ZeroDivisionError.
    percentage = float(scraped_doc) / all_doc * 100 if all_doc else 0.0
    print("All Doc: {}, Scraped Doc: {}, Scraped {}%.".format(all_doc, scraped_doc, percentage))
14 changes: 14 additions & 0 deletions Borges/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BorgesItem(scrapy.Item):
    """Scrapy item for this project; it declares no fields yet.

    Add fields as class attributes, e.g. ``name = scrapy.Field()``.
    """
Loading

0 comments on commit 14db7e7

Please sign in to comment.