-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathword_count.py
More file actions
127 lines (103 loc) · 4.23 KB
/
word_count.py
File metadata and controls
127 lines (103 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
# pip install requests beautifulsoup4
import re
import time
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
START_URL = "https://blog.veitheller.de/" # index page
DOMAIN = urlparse(START_URL).netloc
PAUSE_S = 0.6 # be polite to the server
MAX_INDEX_PAGES = 200 # safety cap
# Treat words as sequences of letters/digits possibly joined by ' or -
WORD_RE = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+(?:[-'][0-9A-Za-zÀ-ÖØ-öø-ÿ]+)*", re.UNICODE)
def get(url):
r = requests.get(url, headers={"User-Agent": "word-count/1.0 (+https://example.org)"}, timeout=20)
r.raise_for_status()
return r.text
def find_next_index_page(soup, base_url):
# Prefer rel="next" (many themes use "next" for "older posts")
link = soup.find("a", rel=lambda v: v and "next" in v.lower())
if link and link.get("href"):
return urljoin(base_url, link["href"])
# Fallback: anchor with "older" in text (cover en/de variations if needed)
for a in soup.find_all("a"):
if a.get("href") and ("older" in a.get_text(strip=True).lower() or "ältere" in a.get_text(strip=True).lower()):
return urljoin(base_url, a["href"])
return None
def extract_index_post_links(soup, base_url):
links = set()
# Best guess: article headers often hold the canonical post link
for a in soup.select("article h1 a[href], article h2 a[href]"):
links.add(urljoin(base_url, a["href"]))
# Fallback: any link under <article>
for a in soup.select("article a[href]"):
links.add(urljoin(base_url, a["href"]))
# Last resort: any on-domain .html link that isn't obviously a non-post page
for a in soup.select("a[href]"):
href = a["href"]
url = urljoin(base_url, href)
if urlparse(url).netloc != DOMAIN:
continue
if not url.endswith(".html"):
continue
low = url.lower()
if any(bad in low for bad in ("/index.html", "/archives", "/archive", "/tags", "/tag/", "/category", "/categories", "/about")):
continue
links.add(url)
return sorted(links)
def extract_article_text_words(html):
soup = BeautifulSoup(html, "html.parser")
# Prefer the main article; otherwise fall back to <main>, then body
main = soup.select_one("article") or soup.select_one("main") or soup.body
if main is None:
main = soup
# Strip code + obvious non-article chrome
for sel in ["pre", "code", ".highlight", "[class*=code]", "script", "style", "nav", "header", "footer", "aside"]:
for el in main.select(sel):
el.decompose()
text = " ".join(main.stripped_strings)
return WORD_RE.findall(text)
def collect_all_post_urls():
urls = set()
seen_index_pages = set()
index_url = START_URL
for _ in range(MAX_INDEX_PAGES):
if not index_url or index_url in seen_index_pages:
break
seen_index_pages.add(index_url)
html = get(index_url)
soup = BeautifulSoup(html, "html.parser")
for u in extract_index_post_links(soup, index_url):
urls.add(u)
next_page = find_next_index_page(soup, index_url)
index_url = next_page
time.sleep(PAUSE_S)
return sorted(urls)
def main():
print(f"Gathering post URLs from index starting at {START_URL} …")
posts = collect_all_post_urls()
print(f"Found {len(posts)} candidate posts.")
total = 0
per_post = []
for i, url in enumerate(posts, 1):
try:
html = get(url)
words = extract_article_text_words(html)
count = len(words)
total += count
per_post.append((url, count))
print(f"[{i:>3}/{len(posts)}] {count:>6} words — {url}")
time.sleep(PAUSE_S)
except Exception as e:
print(f"[ERR] {url}: {e}")
print("\n==== Summary ====")
print(f"Posts counted: {len(per_post)}")
print(f"Total words (excluding code): {total}")
if per_post:
top = sorted(per_post, key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 by word count:")
for url, cnt in top:
print(f"{cnt:>6} {url}")
if __name__ == "__main__":
main()