# Custom filtration--read a .CSV file line by line, scrape each article's page, and append the results.

import csv
import sys

import requests
from bs4 import BeautifulSoup

from utilities import COLUMNS

USER_AGENT = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

def parse_time_to_read(soup):
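    """Return the article's "minutes to read" value, or -1 if it is not present.

    Assumes (based on the selector below) that the reading time is rendered as
    something like <li class="readingTime">6 minutes to read</li>, so the first
    whitespace-separated token of the element's text is the number of minutes.
    """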
    result = soup.find('li', attrs={'class': 'readingTime'})
    if result is None:
        return -1
    time_string = result.text.split()[0]
    return int(time_string)

def count_intro_links(soup):
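    """Count the links between the in-page nav element and the first <h2>.

    Links that sit inside a selector widget (a div whose third-level parent
    carries the "op_single_selector" class) are excluded, since they are page
    chrome rather than intro content. Returns -1 if the nav element is missing.
    """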
    # The <nav id="center-doc-outline"> element precedes the page content, so we start iterating there
    nav_element = soup.find('nav', attrs={'id': 'center-doc-outline'})
    if nav_element is None:
        return -1

    next_element = nav_element.find_next_sibling()
    count = 0
    while next_element and next_element.name != "h2":
        link_elements = next_element.find_all('a')
        for link in link_elements:
            # Check if we're in a selector div, whose third parent is a div with "op_single_selector".
            # If so, don't count this link.
            add_count = 1
            try:
                if link.parent.parent.parent.attrs["class"][0] == "op_single_selector":
                    add_count = 0
            except (AttributeError, KeyError, IndexError):
                # The link is fewer than three levels deep or its ancestor has no class; count it normally.
                pass
            count += add_count
        next_element = next_element.find_next_sibling()
    return count

def count_code_blocks(soup, languages):
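    """Count <pre><code> blocks for the given list of language names (case-insensitive).

    Fenced blocks are assumed to look like <pre><code class="lang-python">...</code></pre>,
    so a block matches when its first class equals "lang-" + language. Passing
    languages=None counts unfenced blocks instead, i.e. <code> elements with no
    class attribute at all.
    """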
    # Code blocks are <pre><code class="lang-language"> where "language" is a specific language name (case insensitive).
    # If the code block is unfenced, the <code> element has no class attribute at all.
    count = 0
    blocks = soup.find_all('pre')

    # Look for unfenced code blocks
    if languages is None:
        for block in blocks:
            code_block = block.find_next("code")
            if code_block is not None and not code_block.has_attr("class"):
                count += 1
        return count
        # return sum(block.text == "" for block in blocks)

    # Look for code blocks for a list of specific languages
    for block in blocks:
        code_block = block.find_next("code")
        for language in languages:
            if code_block is not None and code_block.has_attr("class"):
                if code_block["class"][0].lower() == "lang-" + language.lower():
                    count += 1
    return count

def extract_scrapings(input_file, output_file):
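    """Read the article URLs from input_file, scrape each page, and write output_file.

    The output CSV is a copy of the input with seven extra columns: minutes_to_read,
    links_in_intro, code_blocks_python, code_blocks_js, code_blocks_java,
    code_blocks_cli, and code_blocks_unfenced. Rows whose pages cannot be fetched
    get -1 in all seven columns.
    """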
print("extract_scrapings, INFO, Starting extraction, {}".format(input_file))
with open(input_file, encoding='utf-8') as f_in:
import csv
reader = csv.reader(f_in)
headers = next(reader)
index_url = headers.index(COLUMNS["url"])
with open(output_file, 'w', encoding='utf-8', newline='') as f_out:
writer = csv.writer(f_out)
# Append the columns we'll be adding
headers.append("minutes_to_read")
headers.append("links_in_intro")
headers.append("code_blocks_python")
headers.append("code_blocks_js")
headers.append("code_blocks_java")
headers.append("code_blocks_cli")
headers.append("code_blocks_unfenced")
writer.writerow(headers)
file_count = 0
for row in reader:
url = row[index_url]
file_count += 1
if file_count % 100 == 0:
print("extract_scrapings, INFO, Files processed, {}".format(file_count))
# Go get the page content
try:
response = requests.get(url, headers=USER_AGENT)
response.raise_for_status()
except:
print("extract_scrapings, WARNING, Request failed, {}".format(url))
# Write the row with -1's so we can a failed request in the output
for i in range(0,7):
row.append(-1)
writer.writerow(row)
continue
page_text = response.text
# We need the BeautifulSoup object for multiple parsings, so create it once
soup = BeautifulSoup(page_text, 'html.parser')
time = parse_time_to_read(soup)
link_count = count_intro_links(soup)
# Note: an inconsistency in article is use of "python" or "Python" for code block
# languages, which comes through in the HTML as lang-python and lang-Python, both of
# which we must count
code_count_python = count_code_blocks(soup, ["python"])
code_count_js = count_code_blocks(soup, ["javascript", "js", "typescript", "node", "node.js"])
code_count_java = count_code_blocks(soup, ["java"])
code_count_cli = count_code_blocks(soup, ["cli", "ps", "bash", "shell"])
code_count_unfenced = count_code_blocks(soup, None)
row.append(time)
row.append(link_count)
row.append(code_count_python)
row.append(code_count_js)
row.append(code_count_java)
row.append(code_count_cli)
row.append(code_count_unfenced)
writer.writerow(row)
print("extract_scrapings, INFO, INFO, Competed extraction,, {}".format(output_file))
if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("Usage: python extract_scrapings.py <input_csv_file>")
        print("<input_csv_file> is the output from extract_metadata.py or consolidate.py")
        sys.exit(1)

    input_file = sys.argv[1]  # File is the first argument; [0] is the .py file

    # Making the output filename assumes the input filename has only one .
    elements = input_file.split('.')
    output_file = elements[0] + '-scrapings.' + elements[1]

    extract_scrapings(input_file, output_file)
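
# Example invocation (hypothetical filenames; the input CSV is expected to be the
# output of extract_metadata.py or consolidate.py):
#
#   python extract_scrapings.py consolidated.csv
#
# which writes consolidated-scrapings.csv with the seven scraped columns appended
# to each row.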