
Commit e647ddd

Author: Zoher Juzar Kachwala (committed)

Commit message: First Commit
1 parent 56467d8 commit e647ddd

File tree

7,933 files changed (+3,554,956 / -0 lines)

@@ -0,0 +1 @@
""
Binary file not shown.
@@ -0,0 +1 @@
""

Centralifact DB and ClaimReview Crawler Scripts/animalpoliti.csv (+1,855 lines)
Large diffs are not rendered by default.
@@ -0,0 +1 @@
""
@@ -0,0 +1,39 @@
from scrapy.spiders import SitemapSpider
from scrapy.exceptions import CloseSpider
import extruct
import pandas as pd
# from sqlalchemy import create_engine
# from sqlalchemy.engine.url import URL
# import settings

class claimReviewSpider(SitemapSpider):
    name = 'claimReview-spider'
    USER_AGENT = 'Indiana-University-Zoher-Kachwala([email protected])'
    ROBOTSTXT_OBEY = True
    AUTOTHROTTLE_ENABLED = True
    sitemap_urls = ['http://www.politifact.com/sitemap.xml']
    # Accumulates one row per extracted ClaimReview item
    claimdf = pd.DataFrame()

    def parse(self, response):
        # Stop the crawl once the test limit (2 items here) is reached
        if len(self.claimdf) == 2:
            raise CloseSpider("ClaimReview limit reached")
        # r = requests.get(response.url)
        # Extract schema.org microdata embedded in the page
        data = extruct.extract(response.text, response.url)['microdata']
        selected = [properties for properties in data
                    if properties['type'] == 'http://schema.org/ClaimReview']
        for elements in selected:
            # Console separator so each extracted item stands out in the crawl log
            print "#######################################################################"
            dictt = elements['properties']
            # Flatten single-element lists so json_normalize yields one row
            for key in dictt:
                if type(dictt[key]) == list:
                    dictt[key] = dictt[key][0]
            self.claimdf = self.claimdf.append(pd.io.json.json_normalize(dictt), ignore_index=True)
            # Overwrite the CSV after every extracted item
            self.claimdf.to_csv('yo.csv', encoding='utf-8')
            print "#######################################################################"
        # engine = create_engine(URL(**settings.DATABASE), connect_args={'charset': 'utf8'})
        # claimdf.to_sql('claim', engine, if_exists='replace')
        self.claimdf.to_csv('politifact.csv', encoding='utf-8')
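
Scrapy normally reads per-spider overrides from a custom_settings dict rather than from bare class attributes, so values like USER_AGENT and AUTOTHROTTLE_ENABLED above are usually supplied that way or via the project settings. A minimal sketch (not part of the commit) of running this spider programmatically with those settings applied:

from scrapy.crawler import CrawlerProcess

# Settings mirror the class attributes defined on the spider above
process = CrawlerProcess(settings={
    'USER_AGENT': 'Indiana-University-Zoher-Kachwala([email protected])',
    'ROBOTSTXT_OBEY': True,
    'AUTOTHROTTLE_ENABLED': True,
})
process.crawl(claimReviewSpider)
process.start()  # blocks until the crawl finishes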
Binary file not shown.
@@ -0,0 +1,41 @@
from scrapy.spiders import SitemapSpider
import extruct
import pandas as pd
from scrapy.exceptions import CloseSpider
# from sqlalchemy import create_engine
# from sqlalchemy.engine.url import URL
# import settings

class claimReviewSpider(SitemapSpider):
    name = 'claimReview-spider'
    # Accumulates one row per extracted ClaimReview item
    claimdf = pd.DataFrame()
    USER_AGENT = 'Indiana-University-Zoher-Kachwala([email protected])'
    ROBOTSTXT_OBEY = True
    AUTOTHROTTLE_ENABLED = True
    sitemap_urls = ['https://www.metro.se/sitemap.xml']

    def parse(self, response):
        # Stop the crawl after 100 ClaimReview items
        if len(self.claimdf) == 100:
            raise CloseSpider('100 crawled')
        # Extract schema.org microdata embedded in the page
        data = extruct.extract(response.text, response.url)['microdata']
        selected = [properties for properties in data
                    if properties['type'] == 'http://schema.org/ClaimReview']
        for elements in selected:
            # Console separator so each extracted item stands out in the crawl log
            print "#######################################################################"
            print "YOOOOOOOOOOOOOOOOO"
            dictt = elements['properties']
            # Flatten single-element lists so json_normalize yields one row
            for key in dictt:
                if type(dictt[key]) == list:
                    dictt[key] = dictt[key][0]
            self.claimdf = self.claimdf.append(pd.io.json.json_normalize(dictt), ignore_index=True)
            # Overwrite the CSV after every extracted item
            self.claimdf.to_csv('metro.se.csv', encoding='utf-8')
            print "#######################################################################"
        # engine = create_engine(URL(**settings.DATABASE), connect_args={'charset': 'utf8'})
        # claimdf.to_sql('claim', engine, if_exists='replace')
        self.claimdf.to_csv('metro.se.csv', encoding='utf-8')
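
These scripts target an older pandas API: pd.io.json.json_normalize has since moved to pandas.json_normalize, and DataFrame.append has been removed in recent pandas in favor of pd.concat. A minimal sketch of the equivalent accumulation step under current pandas (names mirror the code above; not a drop-in patch for the commit):

import pandas as pd

row = pd.json_normalize(dictt)  # replaces pd.io.json.json_normalize(dictt)
claimdf = pd.concat([claimdf, row], ignore_index=True)  # replaces claimdf.append(row, ignore_index=True)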
@@ -0,0 +1,60 @@
from scrapy.spiders import SitemapSpider
import extruct
import pandas as pd
from scrapy.exceptions import CloseSpider
from urlparse import urlparse
# from sqlalchemy import create_engine
# from sqlalchemy.engine.url import URL
# import settings

class claimReviewSpider(SitemapSpider):
    name = 'claimReview-spider'
    # Accumulates one row per extracted ClaimReview item
    claimdf = pd.DataFrame()
    USER_AGENT = 'Indiana-University-Researcher-Zoher-Kachwala([email protected])'
    COOKIES_ENABLED = True
    ROBOTSTXT_OBEY = False
    AUTOTHROTTLE_ENABLED = True
    sitemap_urls = ['https://pagellapolitica.it/sitemap.xml']
    # sitemap_urls = ['https://www.washingtonpost.com/web-sitemap-index.xml,https://www.washingtonpost.com/news-sitemap-index.xml,https://www.washingtonpost.com/video-sitemap.xml,https://www.washingtonpost.com/real-estate/sitemap.xml,https://jobs.washingtonpost.com/sitemapindex.xml,https://www.washingtonpost.com/wp-stat/sitemaps/index.xml']

    def parse(self, response):
        # Limit the number of crawled ClaimReview items
        if len(self.claimdf) == 100:
            raise CloseSpider('100 crawled')
        # Extract the embedded structured data (microdata, JSON-LD, RDFa)
        data = extruct.extract(response.text, response.url)
        # Domain name used for the output file; note that strip() removes
        # leading/trailing characters from the given set, not a substring
        domain = urlparse(response.url).netloc.strip('www').strip('.com')
        # Prefer microdata ClaimReview items
        selected = [properties for properties in data['microdata']
                    if properties['type'] == 'http://schema.org/ClaimReview']
        if selected:
            mode = 'micro'
        else:
            # If microdata fails, fall back to JSON-LD, which may nest items under '@graph'
            try:
                selected = [properties for properties in data['json-ld']
                            if properties['@type'] == 'ClaimReview']
            except KeyError:
                selected = [properties for properties in data['json-ld'][0]['@graph']
                            if properties['@type'] == 'ClaimReview']
            mode = 'json'
        for elements in selected:
            # Console separator so each extracted item stands out in the crawl log
            print "#######################################################################"
            print "Testing"
            if mode == 'micro':
                # Microdata keeps the fields one level down, under 'properties'
                elements = elements['properties']
            # Flatten single-element lists so json_normalize yields one row
            for key in elements:
                if type(elements[key]) == list:
                    elements[key] = elements[key][0]
            self.claimdf = self.claimdf.append(pd.io.json.json_normalize(elements), ignore_index=True)
            # Overwrite the CSV file after every extracted item
            self.claimdf.to_csv(domain + '.csv', encoding='utf-8')
            print "#######################################################################"
        # engine = create_engine(URL(**settings.DATABASE), connect_args={'charset': 'utf8'})
        # claimdf.to_sql('claim', engine, if_exists='replace')
        # Writing the final dataframe to CSV
        # claimdf.to_csv(domain + '.csv', encoding='utf-8')
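
The extraction step all three spiders share can be read as one small helper: parse the page's embedded structured data with extruct, take schema.org ClaimReview items from microdata first, and fall back to JSON-LD. The helper below is an illustrative sketch, not code from the commit, and it omits the '@graph' fallback handled above:

import extruct

def extract_claim_reviews(html_text, url):
    # Parse the structured data formats extruct supports (microdata, JSON-LD, RDFa, ...)
    data = extruct.extract(html_text, url)
    # Microdata items keep their fields under 'properties'
    micro = [item['properties'] for item in data.get('microdata', [])
             if item.get('type') == 'http://schema.org/ClaimReview']
    if micro:
        return micro
    # Otherwise take JSON-LD items typed as ClaimReview
    return [item for item in data.get('json-ld', [])
            if item.get('@type') == 'ClaimReview']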
