Skip to content

Commit fd121b9

Browse files
committed
new database update algorithm - fixes #28
A slight change in containers.Item was deemed useful. Tests were also updated and fixed where necessary.
1 parent 261089d commit fd121b9

File tree

3 files changed

+223
-48
lines changed

3 files changed

+223
-48
lines changed

filmatyk/containers.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
from __future__ import annotations
12
from datetime import date
23

34
# This is a globally used dict that binds Item classes to their names.
45
# It should remain empty, as the classes register themselves here.
56
classByString = {}
67

8+
79
class Blueprint(object):
810
"""Blueprint is an abstraction of a property that an Item might have.
911
@@ -30,6 +32,9 @@ class Blueprint(object):
3032
Static methods define some basic, commonly used presentation functions for
3133
known types of properties.
3234
"""
35+
36+
# Presentation styling callables
37+
3338
@staticmethod
3439
def _default(x): return str(x)
3540

@@ -57,6 +62,8 @@ def _rating(x):
5762
def _favourite(x):
5863
return '♥' if x == 1 else ' '
5964

65+
# Functionality
66+
6067
def __init__(self, name:str, colwidth:int, parsing:dict={}, display=None, store=True):
6168
self.display_name = name
6269
self.column_width = colwidth
@@ -76,6 +83,7 @@ def getHeading(self):
7683
def getColWidth(self):
7784
return self.column_width
7885

86+
7987
class UserData(object):
8088
"""Encapsulates user information associated with each Item instance.
8189
@@ -144,6 +152,7 @@ def serialize(self):
144152
serial['rating'] = self.wantto
145153
return serial
146154

155+
147156
class BlueprintInheritance(type):
148157
"""Changes the way inheritance works for Blueprints. Crucial for Item class.
149158
@@ -176,6 +185,7 @@ def __new__(cls, name, bases, dct):
176185
# The new class is now ready
177186
return c
178187

188+
179189
class Item(metaclass=BlueprintInheritance):
180190
"""Base for all types of records used by Filmweb and in the program.
181191
@@ -318,6 +328,21 @@ def asDict(self):
318328
_dict['userdata'] = self.userdata.serialize()
319329
return _dict
320330

331+
def update(self, other: 'Item'):
    """Refresh this Item with the data carried by another Item.

    Every storable property that `other` holds a value for overwrites the
    value held here; storables that `other` lacks are left as they are.
    Afterwards the rating kept in the other Item's user data is handed to this
    Item's own user data via `addRating`.

    The point of updating in place (rather than replacing the object) is to
    keep any custom data attached to this Item when its Blueprinted properties
    change, e.g. because the remote data was updated.

    Important note: currently there are no properties requiring this behavior.
    """
    for name in self.storables:
        try:
            fresh_value = other.properties[name]
        except KeyError:
            # The other Item has no value for this property - keep our own.
            continue
        self.properties[name] = fresh_value
    self.userdata.addRating(other.userdata.rating)
344+
345+
321346
class Movie(Item):
322347
"""Item subclass specialized to hold Movie instances."""
323348
TYPE_STRING = 'FILM'
@@ -349,6 +374,7 @@ class Movie(Item):
349374
def __init__(self, userdata:dict=None, **properties):
    """Create a Movie by forwarding all arguments to the parent initializer.

    `userdata` defaults to a new empty dict on every call. A literal `{}`
    default would be a single object shared by all calls that omit the
    argument, so anything written into it would leak between instances.
    """
    if userdata is None:
        userdata = {}
    super(Movie, self).__init__(userdata, **properties)
351376

377+
352378
class Series(Movie):
353379
"""Item subclass specialized to hold Series instances.
354380
@@ -366,6 +392,7 @@ class Series(Movie):
366392
def __init__(self, userdata:dict=None, **properties):
    """Create a Series by forwarding all arguments to the parent initializer.

    `userdata` defaults to a new empty dict on every call. A literal `{}`
    default would be a single object shared by all calls that omit the
    argument, so anything written into it would leak between instances.
    """
    if userdata is None:
        userdata = {}
    super(Series, self).__init__(userdata, **properties)
368394

395+
369396
class Game(Item):
370397
"""Item subclass specialized to hold Game instances.
371398

filmatyk/database.py

Lines changed: 179 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import containers
66
from filmweb import ConnectionError, FilmwebAPI
77

8+
89
class Database(object):
910
def __init__(self, itemtype:str, api:FilmwebAPI, callback:callable):
1011
self.itemtype = itemtype
@@ -22,6 +23,7 @@ def getItemByID(self, id:int):
2223
for item in self.items:
2324
if item.getRawProperty('id') == id:
2425
return item
26+
return None
2527

2628
def __iter__(self):
2729
return self.items.__iter__()
@@ -43,53 +45,194 @@ def storeToString(self):
4345

4446
# Data acquisition
4547
def softUpdate(self):
    """Quickly pull the most recent changes from Filmweb.

    The algorithm allows detecting additions and removals of items by comparing
    the total item count in the local and remote databases and keeping track of
    the items that differ between the two.
    The two fundamental problems that it solves are related to the fact that
    getting to know the full state of the remote database is a very time-
    consuming operation (it can only be fetched in chunks of n items, usually
    n=25). Therefore the first problem is to determine how many pages to read
    from the remote database. The second problem is related to detecting when
    an item has been deleted remotely.
    The solution can be described in the following steps:
    * compare the item counts between the databases - an update is to be made
      only if there is a difference,
    * fetch a new chunk of the remote database (a page),
    * detect which items have been added or changed with respect to the local
      (this makes use of a special HashedItem class, see its docs for details),
    * identify the last non-changed remote item and find its local counterpart,
    * split the local database into two parts:
      * a "changed" part comprises all items up to and including this last non-
        changed item - all these items are a potentially obsolete state of the
        database, and they could be simply replaced with the currently held
        remote items,
      * an "unchanged" part comprises all other items - nothing about their
        remote counterparts is known at this time.
    * check whether merging the currently held remote items with the possibly
      up-to-date unchanged part of the local database can satisfy the general
      condition (that the local and remote databases should count the same).
    At some point either the counts will even out, or all of the remote items
    will be loaded. In either case the update completes.

    The problem this algorithm solves has one special form that is impossible
    to overcome: when a symmetric change has occurred past a certain page. In
    this case, any count-based algorithm will stop at the first chance it can
    get (when it notices a count balance), ignoring any additions and removals
    that may happen on further pages, if they balance out.
    Example:
      total change: length += 3
      page 1: 4 additions, 1 removal
      page 2: 1 addition, 1 removal
    The algorithm will reach balance after page 1 and not move on to page 2.

    Returns True if the local database was rebuilt. Returns False if the update
    was aborted (network problem, failed login, remote pages running out early)
    or if there was nothing to do; in these cases self.items is left untouched.
    """
    # Display the progress bar
    self.callback(0)
    # Ask the API how many items should there be (abort on network problems)
    try:
        num_request = self.api.getNumOf(self.itemtype)
    except ConnectionError:
        self.callback(-1, abort=True)
        return False
    # Exit if the user failed to log in
    if num_request is None:
        self.callback(-1, abort=True)
        return False
    # Workload estimation
    local_count = len(self.items)
    remote_count, items_per_page = num_request
    still_need = remote_count - local_count
    # Exit if nothing to download
    if not remote_count or not items_per_page or not still_need:
        self.callback(-1)
        return False
    # Convert the existing database to a hashed format
    local_hashed = [HashedItem(item) for item in self.items]
    local_hashed_dict = {item.id: item for item in local_hashed}
    # Prepare for and run the main loop. Note that still_need may be negative
    # (more removals than additions), hence the loop runs until it is exactly 0.
    remote_page_no = 0
    remote_items = []
    local_unchanged = []
    while still_need:
        # Fetch a page and represent it in the hashed form
        remote_page_no += 1
        try:
            page = self.api.getItemsPage(self.itemtype, page=remote_page_no)
        except ConnectionError:
            # NOTE(review): assumes getItemsPage reports network problems the
            # same way getNumOf does - confirm against FilmwebAPI. Nothing has
            # been modified locally yet, so aborting here is safe.
            self.callback(-1, abort=True)
            return False
        fetched_items = [HashedItem(item) for item in page]
        # An empty page means the remote ran out of items before the reported
        # count was reached. Going on would either fail on remote_items[-1] or
        # never terminate, so abort and keep the local database as it is.
        if not fetched_items:
            self.callback(-1, abort=True)
            return False
        # Detect additions and changes among the new items
        for item in fetched_items:
            local_item = local_hashed_dict.get(item.id)
            if local_item is None:
                # This ID was not among known items - it's a simple addition
                item.added = True
                item.changed = True
            else:
                # It was known: compare the hashes to detect a change
                item.added = False
                item.changed = item.hash != local_item.hash
                # Store its local counterpart for a safe update
                item.local_item = local_item
        # Join the new items with the previously acquired but unprocessed ones
        remote_items.extend(fetched_items)
        # One edge case is that all of the remote items have been just acquired.
        # This would happen when updating the Database for the first time.
        # (>= rather than ==, so that overshooting the count cannot be missed.)
        if len(remote_items) >= remote_count:
            local_unchanged = []
            break
        # If the last remote item has been changed, it is difficult to figure out
        # how the currently known remote items relate to the local database.
        # In such a case, another page is fetched, allowing a better view.
        if remote_items[-1].changed:
            continue
        # Otherwise, locate the item in the local Database and split it there.
        # The search by ID relies on HashedItem comparing equal to ints; the
        # item must be present, as an unchanged item has a local counterpart.
        last_unchanged_pos = local_hashed.index(remote_items[-1].id) + 1
        local_unchanged = local_hashed[last_unchanged_pos:]
        # Check if the databases would balance out if they were merged right now.
        still_need = remote_count - (len(remote_items) + len(local_unchanged))
    # At this point the database can be reconstructed from the two components.
    new_items = []
    # First, incorporate the changes from the remotely acquired items
    for item in remote_items:
        if item.local_item is not None:
            # The item had a local counterpart: do not throw it away but instead
            # update it with the remotely acquired data (allows preserving any
            # local data that might not originate at the remote database).
            local_item = item.local_item.parent
            local_item.update(item.parent)
            new_items.append(local_item)
        else:
            new_items.append(item.parent)
    # Then add the rest of unchanged items, skipping any that were already
    # merged above (an item that moved up remotely may still sit in the
    # untouched local tail and must not end up in the database twice).
    merged_ids = {item.id for item in remote_items}
    new_items.extend(
        item.parent for item in local_unchanged if item.id not in merged_ids
    )
    self.items = new_items
    # Finalize - notify the GUI and potential caller.
    self.callback(-1)
    self.isDirty = True
    return True
87179

88180
def hardUpdate(self):
    """Drop all the Items and reload all the data.

    This uses softUpdate under the hood, run against an emptied item list. The
    previous items are backed up first and put back whenever softUpdate does
    not report success - both when it returns a falsy value and when it raises
    (the exception is then propagated after the items have been restored).
    """
    old_items = self.items
    self.items = []
    succeeded = False
    try:
        succeeded = self.softUpdate()
    finally:
        # Runs on normal return and on exceptions alike, so a failure in the
        # middle of the reload can never leave the database empty.
        if not succeeded:
            self.items = old_items
190+
191+
192+
class HashedItem:
    """A hashed representation of an Item that allows detecting changes.

    Computing a standard hash of the Item's UserData makes it possible to detect
    when an Item was not just added or removed but also whether it has changed
    with respect to the locally stored version of that Item.
    Flags indicating whether an item was added or changed are helpful in the
    process of performing an update.

    In theory, the Item class itself could implement the hashing functionality,
    but doing this in a separate technical class also allows storing the flags,
    which would only clutter the base class.

    Some caveats:
    * HashedItem also maintains a reference to the original item that it has been
      created from. This is convenient during the update operation, as it allows
      operating directly on the list of HashedItems instead of having to ensure
      that each list-changing operation happens both on the list of hashes and
      the list of the original items.
    * When used to hash a remotely acquired item, the corresponding local version
      of that item can be attached to the HashedItem. This saves an additional
      search operation later in the update process.
    * HashedItem can be equality-compared with not only instances of the same
      type, but also ints. This makes it possible to search for an integer ID in
      a list of HashedItems.
    * The data hash relies on the built-in hash() of a string, which is only
      stable within a single interpreter run - it must not be persisted.
    """
    # Names of the item fields that take part in change detection
    hash_data = ['rating', 'comment', 'dateOf']

    def __init__(self, item: 'containers.Item'):
        """Wrap an item, remembering its ID and the hash of its user data."""
        self.parent = item
        self.id = item.getRawProperty('id')
        self.hash = self.computeHash(item)
        # Flags used to compare remote items with the local ones
        self.added = None
        self.changed = None
        self.local_item = None

    def computeHash(self, item: 'containers.Item'):
        """Summarize UserData of an Item by a simple hash function.

        Each field listed in hash_data is read with item[field] and converted
        with str() before joining, so that a field which is not already a
        string does not make str.join raise a TypeError.
        """
        userDataString = '#'.join(str(item[prop]) for prop in self.hash_data)
        return hash(userDataString)

    def __eq__(self, other):
        """Compare by ID against ints, fall back to the default otherwise."""
        if isinstance(other, int):
            return self.id == other
        return super().__eq__(other)

    def __hash__(self):
        """Hash by ID, consistently with the equality against ints.

        Defining __eq__ alone would implicitly set __hash__ to None and make
        instances unusable in sets or as dict keys.
        """
        return hash(self.id)

0 commit comments

Comments
 (0)