77
88import containers
99
10- class FilmwebAPI (object ):
11- class Constants (object ):
12- login_path = 'https://ssl.filmweb.pl/j_login'
13- base_path = 'https://www.filmweb.pl'
14- auth_error = 'błędny e-mail lub hasło' #TODO: be a bit more intelligent here
15- main_class = 'userVotesPage__results'
16- item_class = 'userVotesPage__result'
17- rating_source = 'userVotes'
18- rating_stype = 'application/json'
19- m_count_span = 'blockHeader__titleInfoCount'
20- s_count_span = 'blockHeader__titleInfoCount'
21- g_count_span = 'blockHeader__titleInfoCount'
22- @classmethod
23- def getUserPage (self , username ):
24- return self .base_path + '/user/' + username
25- @classmethod
26- def getUserMoviePage (self , username , page = 1 ):
27- userpage = self .getUserPage (username )
28- return userpage + '/films?page={}' .format (page )
29- @classmethod
30- def getUserSeriesPage (self , username , page = 1 ):
31- userpage = self .getUserPage (username )
32- return userpage + '/serials?page={}' .format (page )
33- @classmethod
34- def getUserGamePage (self , username , page = 1 ):
35- userpage = self .getUserPage (username )
36- return userpage + '/games?page={}' .format (page )
37-
38- ConnectionError = requests_html .requests .ConnectionError
3910
11+ ConnectionError = requests_html .requests .ConnectionError
12+
13+ class Constants ():
14+ """URLs and HTML component names for data acquisition.
15+
16+ Create an instance for a given user to generate their specific URLs.
17+ """
18+ login_path = 'https://ssl.filmweb.pl/j_login'
19+ base_path = 'https://www.filmweb.pl'
20+ main_class = 'userVotesPage__results'
21+ item_class = 'userVotesPage__result'
22+ rating_source = 'userVotes'
23+ rating_stype = 'application/json'
24+ movie_count_span = 'blockHeader__titleInfoCount'
25+ series_count_span = 'blockHeader__titleInfoCount'
26+ game_count_span = 'blockHeader__titleInfoCount'
27+
28+ def __init__ (self , username ):
29+ self .username = username
30+ self .userpage = self .getUserPage ()
31+
32+ def getUserPage (self ):
33+ return self .base_path + '/user/' + self .username
34+
35+ def getUserMoviePage (self , page = 1 ):
36+ return self .userpage + '/films?page={}' .format (page )
37+
38+ def getUserSeriesPage (self , page = 1 ):
39+ return self .userpage + '/serials?page={}' .format (page )
40+
41+ def getUserGamePage (self , page = 1 ):
42+ return self .userpage + '/games?page={}' .format (page )
43+
44+
45+ class FilmwebAPI ():
46+ """HTML-based API for acquiring data from Filmweb."""
4047 @staticmethod
4148 def login (username , password ):
49+ """Attempt to acquire an authenticated user session."""
4250 session = requests_html .HTMLSession ()
51+ auth_package = {
52+ 'j_username' : username ,
53+ 'j_password' : password ,
54+ }
55+ # Catch connection errors
4356 try :
44- log = session .post (
45- FilmwebAPI .Constants .login_path ,
46- data = {'j_username' : username , 'j_password' : password }
47- )
48- except FilmwebAPI .ConnectionError :
49- return (False , True ) # login error, connection error
50- if FilmwebAPI .Constants .auth_error in log .text :
57+ log = session .post (Constants .login_path , data = auth_package )
58+ except ConnectionError :
59+ return (False , True )
60+ # Catch authentication errors
61+ if len (session .cookies ) == 0 :
5162 print ('BŁĄD LOGOWANIA' )
5263 return (False , False )
5364 else :
5465 return (True , session )
5566
56- #@staticmethod -- but not actually a static method, see:
57- # https://stackoverflow.com/q/21382801/6919631
58- # https://stackoverflow.com/q/11058686/6919631
59- #in short: THIS METHOD SHOULD NEVER BE CALLED DIRECTLY
6067 def enforceSession (fun ):
61- #this decorator makes the function being called cause the caller
62- #(assumed to be the first object in the argument list) to call the
63- #session check method (which in turn can request a new session)
68+ """Decorator to mark API functions that require a live session.
69+
70+ It will perform a session check before calling the actual function.
71+ Because it assumes that the first argument of the wrapped function is
72+ a bound FilmwebAPI instance ("self"), it shall only be used with FilmwebAPI
73+ methods.
74+ Because it is meant to be used as a class-level function decorator, it has
75+ no real "self" argument. It is effectively something like a static method.
76+ See the following links for more info:
77+ https://stackoverflow.com/q/21382801/6919631
78+ https://stackoverflow.com/q/11058686/6919631
79+ The bottom line is that it should NEVER be called directly.
80+ """
6481 def wrapper (* args , ** kwargs ):
6582 self = args [0 ]
6683 if self .checkSession ():
@@ -69,43 +86,48 @@ def wrapper(*args, **kwargs):
6986 return None
7087 return wrapper
7188
72- def __init__ (self , callback , username :str = '' ):
89+ def __init__ (self , login_handler , username :str = '' ):
7390 self .username = username
74- self .requestLogin = callback
91+ self .constants = Constants (username )
92+ self .login_handler = login_handler
7593 self .session = None
7694 self .parsingRules = {}
77- self .__cacheAllParsingRules ()
95+ for container in containers .classByString .keys ():
96+ self .__cacheParsingRules (container )
7897 # bind specific methods and constants to their item types
7998 self .urlGenerationMethods = {
80- 'Movie' : self .Constants .getUserMoviePage ,
81- 'Series' : self .Constants .getUserSeriesPage ,
82- 'Game' : self .Constants .getUserGamePage
99+ 'Movie' : self .constants .getUserMoviePage ,
100+ 'Series' : self .constants .getUserSeriesPage ,
101+ 'Game' : self .constants .getUserGamePage
83102 }
84103 self .countSpanClasses = {
85- 'Movie' : self .Constants . m_count_span ,
86- 'Series' : self .Constants . s_count_span ,
87- 'Game' : self .Constants . g_count_span
104+ 'Movie' : self .constants . movie_count_span ,
105+ 'Series' : self .constants . series_count_span ,
106+ 'Game' : self .constants . game_count_span
88107 }
89108
90- def __cacheAllParsingRules (self ):
91- for container in containers .classByString .keys ():
92- self .__cacheParsingRules (container )
93-
94109 def __cacheParsingRules (self , itemtype :str ):
95- #get all the blueprints of a given class
110+ """Converts parsing rules for a given type into a neater representation.
111+
112+ The rules for each Blueprint are expressed in a human-readable and human-
113+ writable form that makes them easy to modify if need be, but not very
114+ convenient for the parser. This method groups rules in a parser-friendly
115+ representation that makes its job easier.
116+ """
117+ # Get all the blueprints of a given class
96118 rawRules = {}
97119 for key , val in containers .classByString [itemtype ].blueprints .items ():
98120 rawRules [key ] = val .getParsing ()
99- #convert them to a parsing tree
121+ # Convert them to a parsing tree
100122 pTree = {}
101123 classes = set (rule ['tag' ] for rule in rawRules .values () if rule is not None )
102124 for c in classes :
103125 pTree [c ] = {}
104126 for key , rule in rawRules .items ():
105- #ignore any non-parsable fields
127+ # Ignore any non-parsable fields
106128 if rule is None :
107129 continue
108- #process only the rules of class c
130+ # Process only the rules of class c
109131 if rule ['tag' ] != c :
110132 continue
111133 pClass = rule ['class' ]
@@ -116,44 +138,49 @@ def __cacheParsingRules(self, itemtype:str):
116138 'attr' : rule ['attr' ] if 'attr' in rule .keys () else None ,
117139 'type' : rule ['type' ] if 'type' in rule .keys () else None
118140 }
119- #store the result
141+ # Bind the result to a type name
120142 self .parsingRules [itemtype ] = pTree
121143
122144 def checkSession (self ):
123- #check if there is a session and acquire one if not
145+ """Check if there exists a live session and acquire a new one if not.
146+ #TODO: now with improved session handling we need something smarter
147+ (cause we'll nearly always have a session, except it might sometimes get stale
148+ resulting in an acquisition failure)
149+ """
124150 if not self .session :
125151 self .requestSession ()
126- #check again - in case the user cancelled a login
152+ # Check again - in case the user cancelled a login
127153 if not self .session :
128154 return False
129- #at this point everything is definitely safe
155+ # At this point everything is definitely safe
130156 return True
131157
132158 def requestSession (self ):
133- #call the GUI callback for login handling
134- #it will halt execution until the user logs in or cancels
135- session , username = self .requestLogin (self .username )
136- #if the session wasn't acquired, don't care about username
137- #but for good session, check if it agrees with the existing one (or set it)
159+ """Call the GUI to handle a login and bind a session object to self."""
160+ # This pauses execution until the user logs in or cancels
161+ session , username = self .login_handler (self .username )
138162 if session :
163+ # Set the username in case it's a first run (it will be empty)
139164 if not self .username :
140165 self .username = username
141166 else :
167+ # If it's not the first log in, make sure the user has logged as the
168+ # same user. If the GUI is to be trusted, it shouldn't be possible, but
169+ # we can still check in case of an accident during external usage.
170+ # Returned value isn't important. *NOT* setting self.session is.
142171 if username != self .username :
143- #this should never happen, if GUI is to be trusted
144- #returning is not as important as *not* setting self.session
145172 return None
146173 self .session = session
147174
148175 @enforceSession
149176 def getNumOf (self , itemtype :str ):
150- #return a tuple: ( number of rated movies, number of movies per page)
151- try :
152- getURL = self . urlGenerationMethods [ itemtype ]
153- spanClass = self . countSpanClasses [ itemtype ]
154- except KeyError :
155- return 0 , 0 # should never happen though
156- url = getURL (self . username )
177+ """Return the number of items of a given type that the user has rated.
178+
179+ Returns a tuple: (number of rated items, number of items per page).
180+ """
181+ getURL = self . urlGenerationMethods [ itemtype ]
182+ spanClass = self . countSpanClasses [ itemtype ]
183+ url = getURL ()
157184 page = self .fetchPage (url )
158185 # TODO: in principle, this page could be cached for some small time
159186 #the number of user's movies is inside a span of a specific class
@@ -169,21 +196,21 @@ def getNumOf(self, itemtype:str):
169196 for div in page .body .find_all ('div' ):
170197 if not div .has_attr ('data-id' ) or not div .has_attr ('class' ):
171198 continue
172- if not self .Constants .item_class in div .attrs ['class' ]:
199+ if not self .constants .item_class in div .attrs ['class' ]:
173200 continue
174201 per_page += 1
175202 return items , per_page
176203
177204 @enforceSession
178205 def getItemsPage (self , itemtype :str , page :int = 1 ):
179- # Get the URL of a page containing items of requested type, fetch and parse
180- # it and return data (as list of objects). Instead of several ifs, retrieve
181- # the proper methods from a dict prepared during init.
182- try :
183- getURL = self . urlGenerationMethods [ itemtype ]
184- except KeyError :
185- return [] # should never happen though
186- url = getURL (self . username , page )
206+ """Acquire items of a given type from a given page number.
207+
208+ The user's ratings are displayed on pages. This fetches a page by number,
209+ parses it and returns a list of Item-based objects. URL is delivered by a
210+ cached dict, binding URL-generating functions to respective item types.
211+ """
212+ getURL = self . urlGenerationMethods [ itemtype ]
213+ url = getURL (page )
187214 page = self .fetchPage (url )
188215 data = self .parsePage (page , itemtype )
189216 return data
@@ -216,11 +243,11 @@ def parsePage(self, page, itemtype:str):
216243
217244 def extractDataSource (self , page ):
218245 """Extract the div that holds all the data."""
219- return page .find ('div' , attrs = {'class' : self .Constants .main_class })
246+ return page .find ('div' , attrs = {'class' : self .constants .main_class })
220247
221248 def extractItems (self , div ):
222249 """From the main div, extract all divs holding item details."""
223- sub_divs = div .find_all ('div' , attrs = {'class' : self .Constants .item_class })
250+ sub_divs = div .find_all ('div' , attrs = {'class' : self .constants .item_class })
224251 sub_divs = [div for div in sub_divs if div .has_attr ('data-id' )]
225252 return sub_divs
226253
@@ -229,8 +256,8 @@ def extractRatings(self, div):
229256
230257 They're held in a specific span as <script> contents.
231258 """
232- span = div .find ('span' , attrs = {'data-source' : self .Constants .rating_source })
233- scripts = span .find_all ('script' , attrs = {'type' : self .Constants .rating_stype })
259+ span = div .find ('span' , attrs = {'data-source' : self .constants .rating_source })
260+ scripts = span .find_all ('script' , attrs = {'type' : self .constants .rating_stype })
234261 ratings = [script .getText () for script in scripts ]
235262 return ratings
236263
@@ -278,19 +305,19 @@ def parseRating(self, text):
278305 Item's addRating method.
279306 """
280307 origDict = json .loads (text )
281- #ensure all date keys are present
308+ # Ensure all date keys are present
282309 try :
283310 date_ = origDict ['d' ]
284311 except KeyError :
285- # once I've seen a bugged span in which this key was not present
312+ # I've seen a bugged span once, which didn't have this key
286313 date_ = {'y' :2000 , 'm' :1 , 'd' :1 }
287314 if 'm' not in date_ .keys ():
288315 date_ ['m' ] = 1
289316 if 'd' not in date_ .keys ():
290317 date_ ['d' ] = 1
291- # unescape HTML-coded characters from the comment
318+ # Unescape HTML-coded characters from the comment
292319 comment = html .unescape (origDict ['c' ] if 'c' in origDict .keys () else '' )
293- # translate that dict to more readable standard
320+ # Translate that dict to a more readable standard
294321 iid = origDict ['eId' ]
295322 isFaved = origDict ['f' ] if 'f' in origDict .keys () else 0
296323 ratingDict = {
0 commit comments