This repository was archived by the owner on Dec 17, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetactivity_mailman.py
executable file
·87 lines (83 loc) · 3.68 KB
/
getactivity_mailman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import argparse
from lxml import html
import requests
from lib import util
from lib.backend import Session
from lib.backend.model import ActivityInMailman
from datetime import datetime
def get_activity(verbose=False):
    """Scrape new messages from every Mailman list and store them.

    For each list returned by ``util.list_mailman_lists`` this looks up the
    highest ``message_id`` already stored for that list, walks the list's
    pipermail archive with ``_yield_messages`` and adds one
    ``ActivityInMailman`` row per message yielded.  The session is committed
    after each list has been processed.

    verbose -- when true, print progress information to stdout.
    """
    for mailing_list in util.list_mailman_lists(verbose):
        list_name = mailing_list['name']
        if verbose:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Processing activity for %s...' % list_name)
        latest = Session.query(ActivityInMailman)\
            .filter(ActivityInMailman.list_name == list_name)\
            .order_by(ActivityInMailman.message_id.desc())\
            .first()
        # -1 tells _yield_messages that nothing is stored for this list yet.
        latest_id = latest.message_id if latest else -1
        # Walk through message history from the web front-end: the archive
        # URL is derived from the list's info-page link.
        archive_url = mailing_list['link'].replace('mailman/listinfo', 'pipermail')
        for msg in _yield_messages(archive_url, latest_id, verbose=verbose):
            if verbose:
                print(' -> got msg #%d (%s: "%s")' % (msg['id'], msg['email'], msg['subject']))
            Session.add(ActivityInMailman(
                list_name=list_name,
                message_id=msg['id'],
                subject=msg['subject'],
                author=msg['author'],
                email=msg['email'],
                link=msg['link'],
                timestamp=msg['date']))
        Session.commit()
def _fetch_tree(url, timeout):
    """Download *url* and return its body parsed into an lxml HTML tree."""
    response = requests.get(url, timeout=timeout)
    return html.fromstring(response.text)


def _yield_messages(url, latest_id, verbose=False, timeout=30):
    """Yield messages from a pipermail archive that are newer than latest_id.

    url       -- base URL of the list's pipermail archive index.
    latest_id -- highest message id already known; a negative value disables
                 the "already seen" cut-off so every message is yielded.
    verbose   -- when true, print progress information to stdout.
    timeout   -- seconds passed to every ``requests.get`` call so that an
                 unresponsive server cannot hang the scraper indefinitely.

    Each yielded dict has the keys 'author', 'link', 'subject', 'id',
    'date' (a naive ``datetime``) and 'email'.

    The generator stops at the first message whose id is <= latest_id, which
    assumes the archive is walked newest-first.
    NOTE(review): it also stops at the first message whose date cannot be
    parsed, so older messages are not yielded either -- confirm this is the
    intended behaviour.
    """
    if verbose:
        print('Fetching list index %s...' % url)
    index_tree = _fetch_tree(url, timeout)
    seen_ids = set()
    # Presumably the fourth link of each index row is the by-date view of
    # one month (its href is expected to end in 'date.html').
    for link in index_tree.cssselect('a:nth-child(4)'):
        month_url = url + '/' + link.attrib['href']
        base_url = month_url.replace('date.html', '')
        month_tree = _fetch_tree(month_url, timeout)
        # The message entries are the <li> items of the page's second <ul>.
        items = month_tree.cssselect('ul')[1].cssselect('li')
        # Mailman serves ascending chronological order; walk newest first.
        for li in reversed(items):
            anchors = li.cssselect('a')
            out = {
                'author': li.cssselect('i')[0].text_content().strip(),
                'link': base_url + anchors[0].attrib['href'],
                'subject': anchors[0].text_content().strip(),
                'id': int(anchors[1].attrib['name']),
            }
            if out['id'] in seen_ids:
                if verbose:
                    print(' -> BROKEN LIST violates unique ID constraint (id=%d)' % out['id'])
                continue
            seen_ids.add(out['id'])
            if latest_id >= 0 and out['id'] <= latest_id:
                if verbose:
                    print(' -> No further messages')
                return
            # Download further message details (date & author) from the
            # message's own page.
            msg_tree = _fetch_tree(out['link'], timeout)
            tmp_date = msg_tree.cssselect('i')[0].text_content()
            # Note: Platform-specific implementations can't all handle
            # BST/GMT strings, so the zone name is dropped before parsing.
            tmp_date = tmp_date.replace('GMT ', '').replace('BST ', '')
            try:
                out['date'] = datetime.strptime(tmp_date, '%a %b %d %H:%M:%S %Y')
            except ValueError:
                if verbose:
                    print('Couldnt handle date "%s"' % tmp_date)
                return
            out['email'] = msg_tree.cssselect('a')[0].text_content().strip().replace(' at ', '@')
            yield out
if __name__ == '__main__':
    # Command-line entry point: parse the optional verbosity flag and run
    # the scraper.
    cli = argparse.ArgumentParser(
        description='Scrape Mailman for list data and recent activity.')
    cli.add_argument(
        '-v', '--verbose',
        action='store_true',
        dest='verbose',
        help='Verbose output')
    options = cli.parse_args()
    get_activity(verbose=options.verbose)