-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrss_lib.py
executable file
·134 lines (117 loc) · 3.19 KB
/
rss_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Reads an RSS feed and formats it to fit onto an LCD display.
# Returns a list of posts, where each post is a list of strings
# wrapped to the configured display width.
#
import getopt
import io
import re
import sys
import time

import feedparser
import html2text
url = 'http://www.tagesschau.de/xml/rss2'
db = 'feeds.db'
limit = 12 * 3600 * 1000
#
# functions to get the current time
#
current_time_millis = lambda: int(round(time.time() * 1000))
current_timestamp = current_time_millis()
class rss_reader:
url = ''
db = ''
text_list = []
width = 20
def __init__(self, url, db, width=20):
self.url = url
self.db = db
self.text_list = []
self.width = width
def post_is_in_db(self, title):
""" Check if title is already in database """
try:
with open(self.db, 'r') as database:
for line in database:
if title in line:
return True
except:
return False
return False
def post_is_in_db_with_old_timestamp(self, title):
""" true if the title is in the db with timestamp > limit """
try:
with open(self.db, 'r') as database:
for line in database:
if title in line:
ts_as_string = line.split('|', 1)[1]
ts = long(ts_as_string)
if current_timestamp - ts > limit:
return True
except:
return False
return False
def parse_feeds(self):
""" get the feed data from the url """
feed = feedparser.parse(self.url)
posts_to_print = []
posts_to_skip = []
for post in feed.entries:
# if post is already in the database, skip it
# TODO check the time
title = post.title
if self.post_is_in_db_with_old_timestamp(title):
posts_to_skip.append(title)
else:
posts_to_print.append(post)
# add all the posts we're going to print to the database with the
# current timestamp (but only if they're not already in there)
f = open(self.db, 'a')
for post in posts_to_print:
if not self.post_is_in_db(post.title):
f.write(post.title.encode("utf-8") + "|" +
str(current_timestamp) + "\n")
f.close
# output all of the new posts
count = 1
for post in posts_to_print:
h2t = html2text.HTML2Text()
h2t.inline_links = False
h2t.ignore_links = True
h2t.ignore_images = True
h2t.ignore_emphasis = True
h2t.skip_internal_links = True
h2t.body_width = self.width
text = []
date = time.strftime("%d.%m %H:%M") + ' ' + '[' + str(count) + ']'
text.append(date)
title = h2t.handle(post.title)
for line in title.split('\n'):
if not line.strip():
pass
else:
text.append(str(line.encode('utf-8')))
# Try to filter out links even before passing to the formatter
# Example: <a href='http://earth.google.com/'>world</a>
pattern =r'\[*<a.*?>.*?</a>\]*'
result = re.sub(pattern , "", post.description)
description = h2t.handle(result)
for line in description.split('\n'):
if line.startswith(" *"):
pass
elif not line.strip():
pass
else:
text.append(str(line.encode('utf-8')))
self.text_list.append(text)
count += 1
return self.text_list
if __name__ == "__main__":
rss = rss_reader(url, db, 20)
post_list = rss.parse_feeds()
for post in post_list:
print('*' * 22)
for line in post:
# print(type(line))
print(' ' + line)