
Commit d0968bf

Author: sxs149331
Commit message: Changes mongo server IP
1 parent: 5d2e97e


52 files changed (+5810, -2 lines)

Analysis.py

Lines changed: 18 additions & 1 deletion
@@ -1,6 +1,10 @@
 import json
 import requests

+from difflib import SequenceMatcher
+
+def similar(a, b):
+    return SequenceMatcher(None, a, b).ratio()

 url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"

@@ -41,6 +45,9 @@
 root_code_not_found = 0
 event_match = 0
 doc_count = 0
+output_file = open("events.txt", "w+")
+
+similar_count = {}

 for doc_id in document_to_event_map:
     if len(document_to_event_map[doc_id]) == 2:
@@ -64,7 +71,7 @@

         response = requests.get(url)

-        print response.content
+        #print response.content

         data = json.loads(response.content)
         sentences = data['data']
@@ -80,15 +87,25 @@
         print sent1_id, ":", sent1
         print sent2_id, ":", sent2
         print events[0]['source'], events[0]['target'], events[0]['code']
+        val = int(round(10*similar(sent1, sent2)))
+        if val not in similar_count:
+            similar_count[val] = 0
+        similar_count[val] = similar_count[val] + 1

+        from newsplease import NewsPlease

+        article = NewsPlease.from_url(events[0]['url'])
+        print events[0]['url']
+        print(article.text)

         doc_count += 1

 print doc_count
 print root_code_match
 print event_match

+print similar_count

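The additions above bucket how similar the two sentences behind each event pair are: similar() returns a 0-1 ratio from difflib's SequenceMatcher, which is then scaled to an integer bin from 0 to 10 and tallied in similar_count. A minimal, standalone sketch of that bucketing step; the sentence pairs here are made up, not taken from the API:

# Standalone sketch of the similarity bucketing added in this commit.
# The sentence pairs below are hypothetical stand-ins for sent1/sent2.
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

similar_count = {}
pairs = [
    ("Protesters marched in the capital.", "Demonstrators marched through the capital."),
    ("Talks resumed on Monday.", "The two sides resumed talks on Monday."),
]
for sent1, sent2 in pairs:
    val = int(round(10 * similar(sent1, sent2)))        # bin the 0-1 ratio into 0..10
    similar_count[val] = similar_count.get(val, 0) + 1  # count pairs per bin
print(similar_count)
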
Analysis2.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
import json
import requests
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"

response = requests.get(url)

print response

data = json.loads(response.content)

#print response.content

print "Data Loading Complete. Entry count ", len(data["data"])

document_to_event_map = {}

for event in data['data']:

    doc_id = event["id"].split("_")[0]
    if doc_id not in document_to_event_map:
        document_to_event_map[doc_id] = []

    document_to_event_map[doc_id].append(event)


print len(document_to_event_map)

count_map = {}

for doc in document_to_event_map:
    if len(document_to_event_map[doc]) not in count_map:
        count_map[len(document_to_event_map[doc])] = 0
    count_map[len(document_to_event_map[doc])] += 1


print count_map

root_code_match = 0
root_code_not_found = 0
event_match = 0
doc_count = 0
output_file = open("events.txt", "w+")

similar_count = {}
with open("output.txt", "w+") as out:
    for doc_id in document_to_event_map:
        if len(document_to_event_map[doc_id]) == 3:
            events = document_to_event_map[doc_id]
            #print events[0]
            if 'source' not in events[0] or 'target' not in events[0]:
                continue

            if 'source' not in events[1] or 'target' not in events[1]:
                continue

            if 'source' not in events[2] or 'target' not in events[2]:
                continue

            url = "http://eventdata.utdallas.edu/api/article?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&doc_id=" + doc_id

            response = requests.get(url)

            data = json.loads(response.content)
            sentences = data['data']

            for i in range(0, len(events)):
                print >> out, events[i]['code'], events[i]['source'], events[i]['target']
                sent_id = events[i]['id'].split("_")[1]
                j = 0
                while sent_id != str(sentences[j]['sentence_id']):
                    j += 1
                print >> out, sent_id, ": ", sentences[j]["sentence"]

            from newsplease import NewsPlease

            print >> out, "================= FULL ARTICLE ==============="
            article = NewsPlease.from_url(events[0]['url'])
            print >> out, events[0]['url']
            print >> out, article.text

            doc_count += 1
out.close()
print doc_count
print root_code_match
print event_match

print similar_count

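Analysis2.py repeats the grouping logic from Analysis.py: events returned by the API are grouped by the document id encoded in the first part of each event's id field, and documents that produced exactly three events are then re-fetched sentence by sentence. A small sketch of just the grouping and counting step, run on a hypothetical response payload rather than the live API:

# Sketch of the document-to-event grouping used above, on a made-up payload.
data = {"data": [
    {"id": "doc1_1", "code": "042", "source": "GOV", "target": "REB"},
    {"id": "doc1_2", "code": "010", "source": "REB", "target": "GOV"},
    {"id": "doc2_3", "code": "190", "source": "GOV", "target": "CVL"},
]}

document_to_event_map = {}
for event in data["data"]:
    doc_id = event["id"].split("_")[0]      # text before "_" identifies the source article
    document_to_event_map.setdefault(doc_id, []).append(event)

# How many documents produced 1, 2, 3, ... events.
count_map = {}
for doc in document_to_event_map:
    n = len(document_to_event_map[doc])
    count_map[n] = count_map.get(n, 0) + 1

print(count_map)   # {2: 1, 1: 1} for the payload above
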
app_v2.py

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ def query_formatter(query):

 def __get_mongo_connection():
     # For local debugging
-    MONGO_SERVER_IP = "172.29.100.22"
+    MONGO_SERVER_IP = "172.29.100.16"
     MONGO_PORT = "3154"
     MONGO_USER = "event_reader"
     MONGO_PSWD = "dml2016"

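The only change here, and the one named in the commit message, is pointing __get_mongo_connection() at a different MongoDB host (172.29.100.16 instead of 172.29.100.22); the rest of the helper is not shown in the diff. A hedged sketch of how a connection could be built from these settings with pymongo follows; the actual helper in app_v2.py may construct its client differently:

# Hypothetical sketch only: how the shown settings could become a client with
# pymongo. The real body of __get_mongo_connection() is not part of this diff.
from pymongo import MongoClient

def get_mongo_connection():
    MONGO_SERVER_IP = "172.29.100.16"   # value introduced by this commit
    MONGO_PORT = "3154"
    MONGO_USER = "event_reader"
    MONGO_PSWD = "dml2016"
    uri = "mongodb://%s:%s@%s:%s" % (MONGO_USER, MONGO_PSWD, MONGO_SERVER_IP, MONGO_PORT)
    return MongoClient(uri)
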
newsplease/NewsArticle.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
class NewsArticle(object):
    """
    Class representing a single news article containing all the information that news-please can extract.
    """
    authors = []
    date_download = None
    date_modify = None
    date_publish = None
    description = None
    filename = None
    image_url = None
    language = None
    localpath = None
    source_domain = None
    text = None
    title = None
    title_page = None
    title_rss = None
    url = None

    def get_dict(self):
        """
        Get the dict of the instance of this class.
        :return:
        """
        return {
            'authors': self.authors,
            'date_download': self.date_download,
            'date_modify': self.date_modify,
            'date_publish': self.date_publish,
            'description': self.description,
            'filename': self.filename,
            'image_url': self.image_url,
            'language': self.language,
            'localpath': self.localpath,
            'source_domain': self.source_domain,
            'text': self.text,
            'title': self.title,
            'title_page': self.title_page,
            'title_rss': self.title_rss,
            'url': self.url
        }

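NewsArticle is a plain data holder for everything news-please extracts; get_dict() flattens the fields into a dict. A short usage sketch with made-up field values, assuming the package layout added in this commit:

# Usage sketch with made-up values; NewsArticle only stores extracted fields.
from newsplease.NewsArticle import NewsArticle

article = NewsArticle()
article.title = "Example headline"
article.url = "http://example.com/story"
article.text = "Body of the article..."

print(article.get_dict()["title"])   # -> "Example headline"
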
newsplease/__init__.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
import datetime
import os
import sys
import urllib

from six.moves import urllib

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from newsplease.pipeline.extractor import article_extractor
from newsplease.crawler.items import NewscrawlerItem
from dotmap import DotMap
from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler


class NewsPlease:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the
        article extractor.
        :return:
        """
        html = str(warc_record.raw_stream.read())
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article

    @staticmethod
    def from_html(html, url=None, download_date=None):
        """
        Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but
        only uses the article extractor. If you have the original URL make sure to provide it as this helps NewsPlease
        to extract the publishing date and title.
        :param html:
        :param url:
        :return:
        """
        extractor = article_extractor.Extractor(
            ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])

        title_encoded = ''.encode()
        if not url:
            url = ''

        # if an url was given, we can use that as the filename
        filename = urllib.parse.quote_plus(url) + '.json'

        item = NewscrawlerItem()
        item['spider_response'] = DotMap()
        item['spider_response'].body = html
        item['url'] = url
        item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
        item['html_title'] = title_encoded
        item['rss_title'] = title_encoded
        item['local_path'] = None
        item['filename'] = filename
        item['download_date'] = download_date
        item['modified_date'] = None
        item = extractor.extract(item)

        tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
        final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
        # final_article = DotMap(tmp_article)
        return final_article

    @staticmethod
    def from_url(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return: A dict containing all the information of the article. Else, None.
        """
        articles = NewsPlease.from_urls([url])
        if url in articles.keys():
            return articles[url]
        else:
            return None

    @staticmethod
    def from_urls(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        results = {}
        download_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if len(urls) == 0:
            pass
        elif len(urls) == 1:
            url = urls[0]
            html = SimpleCrawler.fetch_url(url)
            results[url] = NewsPlease.from_html(html, url, download_date)
        else:
            results = SimpleCrawler.fetch_urls(urls)
            for url in results:
                results[url] = NewsPlease.from_html(results[url], url, download_date)

        return results

    @staticmethod
    def from_file(path):
        """
        Crawls articles from the urls and extracts relevant information.
        :param path: path to file containing urls (each line contains one URL)
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        with open(path) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        urls = list(filter(None, content))

        return NewsPlease.from_urls(urls)

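The NewsPlease class added here is the facade that Analysis.py and Analysis2.py call: from_url() fetches a page with SimpleCrawler, runs the extractor chain via from_html(), and returns an article object, or None if the URL could not be crawled. A brief usage sketch; the URL is a placeholder, not one from the commit:

# Usage sketch of the facade above; the URL is a placeholder.
from newsplease import NewsPlease

article = NewsPlease.from_url("http://example.com/some-news-story")
if article is not None:
    print(article.title)
    print(article.text)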