from scrapy.spiders import SitemapSpider
import extruct
import pandas as pd
from scrapy.exceptions import CloseSpider
from urllib.parse import urlparse
# from sqlalchemy import create_engine
# from sqlalchemy.engine.url import URL
# import settings


class ClaimReviewSpider(SitemapSpider):
    name = 'claimReview-spider'
    claimdf = pd.DataFrame()
    # Spider-level settings only take effect when declared in custom_settings
    custom_settings = {
        'USER_AGENT': 'Indiana-University-Researcher-Zoher-Kachwala([email protected])',
        'COOKIES_ENABLED': True,
        'ROBOTSTXT_OBEY': False,
        'AUTOTHROTTLE_ENABLED': True,
    }
    sitemap_urls = ['https://pagellapolitica.it/sitemap.xml']
    # sitemap_urls = ['https://www.washingtonpost.com/web-sitemap-index.xml',
    #                 'https://www.washingtonpost.com/news-sitemap-index.xml',
    #                 'https://www.washingtonpost.com/video-sitemap.xml',
    #                 'https://www.washingtonpost.com/real-estate/sitemap.xml',
    #                 'https://jobs.washingtonpost.com/sitemapindex.xml',
    #                 'https://www.washingtonpost.com/wp-stat/sitemaps/index.xml']

    def parse(self, response):
        # Stop once 100 ClaimReview items have been collected
        if len(self.claimdf) >= 100:
            raise CloseSpider('100 ClaimReview items crawled')
        # Extract structured data (microdata and JSON-LD) from the page
        data = extruct.extract(response.text, response.url)
        # Derive a filename-friendly domain name (drop a leading "www." if present)
        netloc = urlparse(response.url).netloc
        domain = netloc[4:] if netloc.startswith('www.') else netloc
        # Prefer microdata ClaimReview items
        selected = [item for item in data['microdata']
                    if item.get('type') == 'http://schema.org/ClaimReview']
        if selected:
            mode = 'micro'
        else:
            # Fall back to JSON-LD, which may be flat or wrapped in an @graph array
            json_ld = data.get('json-ld', [])
            selected = [item for item in json_ld if item.get('@type') == 'ClaimReview']
            if not selected and json_ld and '@graph' in json_ld[0]:
                selected = [item for item in json_ld[0]['@graph']
                            if item.get('@type') == 'ClaimReview']
            mode = 'json'
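        # Illustrative sketch of the item shape expected in `selected` (field names
        # follow schema.org/ClaimReview; the exact keys vary by publisher):
        #   {"@type": "ClaimReview", "claimReviewed": "...", "datePublished": "...",
        #    "author": {"@type": "Organization", "name": "..."},
        #    "reviewRating": {"@type": "Rating", "alternateName": "..."}}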
        for elements in selected:
            print("#" * 71)
            print("Testing")
            if mode == 'micro':
                elements = elements['properties']
            # Flatten single-element lists so json_normalize yields scalar columns
            for key in elements:
                if isinstance(elements[key], list):
                    elements[key] = elements[key][0]
            self.claimdf = pd.concat([self.claimdf, pd.json_normalize(elements)],
                                     ignore_index=True)
            # Overwrite the CSV file after every crawled item
            self.claimdf.to_csv(domain + '.csv', encoding='utf-8')
            print("#" * 71)
        #engine=create_engine(URL(**settings.DATABASE),connect_args={'charset':'utf8'})
        #claimdf.to_sql('claim',engine, if_exists='replace')
        #Writing the final dataframe to CSV
        #claimdf.to_csv(domain+'.csv', encoding='utf-8')
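

# A minimal local-run sketch (an assumption, not part of the original commit): runs the
# spider directly with Scrapy's CrawlerProcess instead of `scrapy crawl` inside a
# project, picking up the custom_settings defined on the class above.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(ClaimReviewSpider)
    process.start()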