-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: scrapper.py
executable file
·72 lines (54 loc) · 1.7 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
import os
import urllib
import urllib.request
import webbrowser

from bs4 import BeautifulSoup
from termcolor import colored, cprint
# Scrape the Hacker News front page, print a numbered menu of article
# titles, then open the article the user selects in the default browser.
hackernews_url = "https://news.ycombinator.com/"

# Fetch and decode the front page. The context manager guarantees the
# HTTP connection is closed even if decoding fails.
with urllib.request.urlopen(hackernews_url) as http_response:
    content = http_response.read().decode('utf-8')

soup = BeautifulSoup(content, "lxml")
table = soup.find('table', class_="itemlist")
if table is None:
    # The HN markup has changed (or the fetch returned an error page);
    # fail with a clear message instead of an AttributeError below.
    raise SystemExit("Could not find the story table; Hacker News markup may have changed.")
table_rows = table.find_all("tr")

# Each story occupies a group of 3 <tr> rows; only the first row of each
# group holds the title cell (third <td>). The front page lists 30
# stories, so we stop after 30 * 3 = 88 rows (0-indexed row 88 starts a
# region past the stories).
titles = []  # article titles, in page order
links = []   # corresponding article URLs, parallel to `titles`
for row_index, row in enumerate(table_rows):
    if row_index == 88:
        break
    if row_index % 3 != 0:
        continue  # skip the subtext/spacer rows of each story group
    title_cell = row.find_all("td")[2]
    anchor = title_cell.find("a")
    titles.append(title_cell.find(string=True))
    links.append(anchor.get("href"))

# Numbered menu: "1 . <title>" etc.
for serial_no, title in enumerate(titles, start=1):
    print(serial_no, ".", title)

# Interactive loop: read article numbers until the user types "exit".
# Non-numeric, non-"exit" input is silently ignored, as is any number
# outside the menu range — matching the original behavior.
while True:
    user_input = input()
    if user_input == "exit":
        break
    try:
        choice = int(user_input)
    except ValueError:
        continue
    if 1 <= choice <= len(titles):
        url = links[choice - 1]
        webbrowser.open(url, new=2)  # new=2: open in a new browser tab
        print("opening", colored(url, 'red', attrs=['bold']))