app.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage:
python app.py <username>
"""
import concurrent.futures
import json
import os
import sys
import warnings

import requests
from tqdm import tqdm

warnings.filterwarnings("ignore")


class InstagramScraper:

    def __init__(self, username):
        self.username = username
        self.numPosts = 0
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
        self.future_to_item = {}

    def crawl(self, max_id=None):
        """Walks through the user's media, one page at a time"""
        url = 'http://instagram.com/' + self.username + '/media' + ('?&max_id=' + max_id if max_id is not None else '')
        resp = requests.get(url)
        media = json.loads(resp.text)
        self.numPosts += len(media['items'])

        # Queue each item for download on the thread pool
        for item in media['items']:
            future = self.executor.submit(self.download, item, './' + self.username)
            self.future_to_item[future] = item

        sys.stdout.write('\rFound %i post(s)' % self.numPosts)
        sys.stdout.flush()

        # Follow pagination until no more pages are available
        if 'more_available' in media and media['more_available'] is True:
            max_id = media['items'][-1]['id']
            self.crawl(max_id)

    def download(self, item, save_dir='./'):
        """Downloads the media file"""
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        item['url'] = item[item['type'] + 's']['standard_resolution']['url']
        base_name = item['url'].split('/')[-1].split('?')[0]
        file_path = os.path.join(save_dir, base_name)

        with open(file_path, 'wb') as media_file:
            content = requests.get(item['url']).content
            media_file.write(content)

        # Set the file's access/modification times to the post's creation time
        file_time = int(item['created_time'])
        os.utime(file_path, (file_time, file_time))


if __name__ == '__main__':
    username = sys.argv[1]
    scraper = InstagramScraper(username)
    scraper.crawl()

    # Wait for the queued downloads to finish and report any failures
    for future in tqdm(concurrent.futures.as_completed(scraper.future_to_item),
                       total=len(scraper.future_to_item), desc='Downloading'):
        item = scraper.future_to_item[future]
        if future.exception() is not None:
            print('%r generated an exception: %s' % (item['url'], future.exception()))