utilities.py
import re
import json
from urllib.parse import urlparse, urljoin

def extract_title(soup):
    return soup.title.string if soup.title else None

def extract_meta_description(soup):
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    # .get() avoids a KeyError when the tag exists but has no content attribute
    return meta_tag.get('content') if meta_tag else None

def extract_h1(soup):
    return soup.h1.text if soup.h1 else None

def extract_h2(soup):
    # All <h2> texts, serialized to JSON so they fit in a flat record
    return json.dumps([h2.text for h2 in soup.find_all('h2')])

def extract_internal_links(soup, url):
    # Links whose resolved host matches the page host
    page_netloc = urlparse(url).netloc
    internal_links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)
                      if urlparse(urljoin(url, a['href'])).netloc == page_netloc]
    return json.dumps(internal_links)

def extract_external_links(soup, url):
    # Links whose resolved host differs from the page host
    page_netloc = urlparse(url).netloc
    external_links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)
                      if urlparse(urljoin(url, a['href'])).netloc != page_netloc]
    return json.dumps(external_links)

def count_internal_links(internal_links_json):
    # Number of unique internal links
    return len(set(json.loads(internal_links_json)))

def count_external_links(external_links_json):
    # Number of unique external links
    return len(set(json.loads(external_links_json)))

def extract_url_length(url):
    return len(url)

def extract_title_length(title):
    return len(title) if title else None

def extract_meta_description_length(meta_description):
    return len(meta_description) if meta_description else None

def extract_h1_length(h1):
    return len(h1) if h1 else None

def extract_h2_length(h2_json):
    return json.dumps([len(h2) for h2 in json.loads(h2_json)])

def extract_count_paragraphs(soup):
    # Count <p> tags on the page
    return len(soup.find_all('p'))

def extract_response_time(response):
    return response.elapsed.total_seconds()

def extract_status(response):
    return response.status_code

def extract_word_count(soup):
    # Note: decompose() strips <script> and <style> tags from the soup in place
    for s in soup(['script', 'style']):
        s.decompose()
    words = re.findall(r'\w+', soup.get_text())
    return len(words)

def extract_page_size(response):
    return len(response.content)

def extract_text_ratio(soup, response):
    # Ratio of visible text length to raw HTML length
    text_length = len(soup.get_text())
    html_length = len(response.content.decode('utf-8', 'ignore'))
    return text_length / html_length if html_length else 0.0

def extract_canonical_url(soup):
    canonical_tag = soup.find('link', attrs={'rel': 'canonical'})
    return canonical_tag.get('href') if canonical_tag else None

def extract_meta_robots(soup):
    meta_robots_tag = soup.find('meta', attrs={'name': 'robots'})
    return meta_robots_tag.get('content') if meta_robots_tag else None

def extract_image_alt_attributes(soup):
    # alt text per <img>, None where the attribute is missing
    return json.dumps([img.get('alt') for img in soup.find_all('img')])

def extract_structured_data(soup):
    # Raw JSON-LD blocks as strings
    return json.dumps([script.string for script in soup.find_all('script', attrs={'type': 'application/ld+json'})])

def extract_language_tags(soup):
    # Prefer the <html lang="..."> attribute, fall back to the content-language meta tag
    if soup.html is not None:
        lang = soup.html.get('lang')
        if lang:
            return lang
    meta_tag = soup.find('meta', attrs={'http-equiv': 'content-language'})
    if meta_tag:
        content_language = meta_tag.get('content')
        if content_language:
            return content_language.split('-')[0]
    return None

def extract_meta_keywords(soup):
    # The standard meta tag name is 'keywords'
    meta_tag = soup.find('meta', attrs={'name': 'keywords'})
    return meta_tag['content'] if meta_tag and 'content' in meta_tag.attrs else None

def generate_seo_alerts(item):
    # Flag common SEO issues from the extracted fields; each alert is a boolean
    alerts = {
        'alert_missing_title': item['title'] is None,
        'alert_title_length': (item['title_length'] < 55 or item['title_length'] > 60) if item['title_length'] is not None else False,
        'alert_missing_meta_description': item['meta_description'] is None,
        'alert_meta_description_length': (item['meta_description_length'] < 155 or item['meta_description_length'] > 160) if item['meta_description_length'] is not None else False,
        'alert_missing_h1': item['h1'] is None,
        'alert_incorrect_canonical_url': item['canonical_url'] is None or item['canonical_url'] != item['url'],
        'alert_missing_image_alt_attributes': any(alt is None for alt in json.loads(item['image_alt_attributes'])),
        'alert_missing_language_tag': item['language_tags'] is None,
        'alert_low_text_ratio': item['text_ratio'] < 0.15,
        'alert_no_meta_robots_or_incorrect_directives': item['meta_robots'] is None or 'noindex' in item['meta_robots'].lower() or 'nofollow' in item['meta_robots'].lower(),
    }
    alerts['has_alert'] = any(alerts.values())
    alert_names = list(alerts.keys())
    return alerts, alert_names
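
# ---------------------------------------------------------------------------
# Usage sketch: one way these helpers might be wired together end to end.
# It assumes the caller fetches the page with `requests` and parses it with
# `bs4.BeautifulSoup`; those imports and the example URL are assumptions for
# illustration, not requirements stated by the module. Only the keys read by
# generate_seo_alerts() are filled in here.
if __name__ == '__main__':
    import requests
    from bs4 import BeautifulSoup

    url = 'https://example.com/'  # hypothetical target page
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    item = {
        'url': url,
        'title': extract_title(soup),
        'meta_description': extract_meta_description(soup),
        'h1': extract_h1(soup),
        'canonical_url': extract_canonical_url(soup),
        'meta_robots': extract_meta_robots(soup),
        'image_alt_attributes': extract_image_alt_attributes(soup),
        'language_tags': extract_language_tags(soup),
        'text_ratio': extract_text_ratio(soup, response),
    }
    item['title_length'] = extract_title_length(item['title'])
    item['meta_description_length'] = extract_meta_description_length(item['meta_description'])

    alerts, alert_names = generate_seo_alerts(item)
    for name in alert_names:
        if name != 'has_alert' and alerts[name]:
            print(f'ALERT: {name}')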