From 637d89a7ff7d51d5d919ec62352bc362e6d93995 Mon Sep 17 00:00:00 2001
From: Abhinav Maurya
Date: Mon, 22 Feb 2016 23:03:01 -0500
Subject: [PATCH] Fix minor bug in pin scraping

While using the pinterest scraper, I came across the following error:

Obtained 550 results after 9 scrolls
Scraping: http://www.pinterest.com/nealthegr8/detailed/
URL failed: http://www.pinterest.com/nealthegr8/detailed/
connections attempted: 1
exception message: 'NoneType' object has no attribute 'get'
Traceback (most recent call last):
  File "pinterest.py", line 136, in process_whole_page
    results = process(soup)
  File "pinterest.py", line 420, in <lambda>
    for pin in soup.select('div.item')
  File "pinterest.py", line 388, in parse_pin
    pin.find('a', {'class': 'pinImageWrapper'}).get('href')),
AttributeError: 'NoneType' object has no attribute 'get'

I am not sure whether the BeautifulSoup find() function has changed since
this code was written, but using the current way of selecting a class in a
soup, i.e. passing class_='myclass' as an argument to the find() function,
solved the problem for me.
--- vislab/datasets/pinterest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vislab/datasets/pinterest.py b/vislab/datasets/pinterest.py index c3de61e..48d0cf6 100644 --- a/vislab/datasets/pinterest.py +++ b/vislab/datasets/pinterest.py @@ -385,7 +385,7 @@ def parse_pin(pin, username, board_name, query): data = { 'username': username, 'pin_url': 'www.pinterest.com{}'.format( - pin.find('a', {'class': 'pinImageWrapper'}).get('href')), + pin.find('a', class_='pinImageWrapper').get('href')), 'repins_likes_url': ['www.pinterest.com{}'.format( link['href']) for link in pin.select('a.socialItem')], 'caption': caption, @@ -417,7 +417,7 @@ def scrape_pins(driver, board, pin_collection): pins = process_whole_page( driver, url, lambda soup: [parse_pin( pin, board['username'], board['board_name'], board['query']) - for pin in soup.select('div.item') + for pin in soup.select('div.pinWrapper') ]) for pin in pins: if pin_collection.find({'_id': pin['_id']}).count() == 0: