-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.py
119 lines (98 loc) · 3.34 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#%%
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from tinydb import Query, TinyDB
#%% set up the DB
# TinyDB database kept in library.json in the working directory.
# NOTE(review): sort_keys/indent are presumably forwarded to the JSON writer
# so the file stays diff-friendly — confirm against the TinyDB storage docs.
db = TinyDB("library.json", sort_keys=True, indent=4)
# Shared Query object used to build the upsert conditions in later cells.
q = Query()
#%% Load the library spreadsheet (an Excel workbook, not a CSV)
book_table_df = pd.read_excel("Library Books.xlsx")
# Uncomment to work on a small sample while developing:
# book_table_df = book_table_df.head(20)
# Bare expression: only shows a preview when run as an interactive cell.
book_table_df.head()
# %% This section makes a DB of the books and authors:
# There are a lot of things still to do with this, e.g. there's no studio split etc.
# Table handles used by insert_authors and the book upsert loop below.
book_table = db.table("books")
author_table = db.table("authors")
#%% Make a little viz of the subjects in the library
# Only subjects attached to more than `cut` books are drawn.
cut = 10
# A Subject cell can hold several "|"-separated subjects. Cells that are not
# strings (e.g. empty cells) are skipped.
subjects = []
for s in book_table_df.Subject:
    if isinstance(s, str):
        # Trim every fragment (as is done for author names) and drop blank
        # ones, so "A | B" and "A|B" are counted as the same two subjects.
        subjects.extend(part.strip() for part in s.split("|") if part.strip())
print(f"there are {len(set(subjects))} unique subjects")
# Number of books per subject, most frequent first.
svc = pd.Series(subjects).value_counts()
svc[svc > cut].plot(kind="barh", figsize=(6, 10))
plt.xlabel("count")
plt.title(f"Number of books with this subject\n(showing only those over {cut} books)")
# savefig does not create missing directories, so make sure plots/ exists.
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/subjects.png")
#%% Make a list of ESD books for Adrian (currently disabled; uncomment the two lines below to regenerate esd_books.csv)
# esd_books = book_table_df[["ESD" in str(sub) for sub in book_table_df.Subject]]
# esd_books.to_csv("esd_books.csv")
#%%
def get_role(a: object) -> str:
    """Return the role encoded in one raw author entry.

    Args:
        a: A single author fragment as written in the spreadsheet,
            e.g. ``"Jane Doe (ed)"``.

    Returns:
        ``"editor"`` when the entry contains the ``"(ed)"`` marker, otherwise
        ``""``. Non-string input (such as the float NaN pandas uses for an
        empty cell) also yields ``""`` instead of raising ``TypeError``.
    """
    if isinstance(a, str) and "(ed)" in a:
        return "editor"
    return ""
def insert_authors(authors_str, book_title):
    """Upsert every author named in *authors_str* and return their ids.

    Args:
        authors_str: ``"|"``-separated author names, each optionally carrying
            an ``"(ed)"`` marker. Anything that is not a string (for example
            the NaN of an empty spreadsheet cell) means "no authors".
        book_title: Title of the book being processed. Currently unused; kept
            so existing callers keep working.

    Returns:
        Ids of the matching documents in the module-level ``author_table``,
        in order of first appearance and without duplicates. Empty when there
        are no usable author names.
    """
    ids = []
    if not isinstance(authors_str, str):
        return ids
    for a in authors_str.split("|"):
        # role = get_role(a) #TODO: role is on a per book basis, so that relationship needs to be a bit more complicated
        author_name = a.replace("(ed)", "").strip()
        if not author_name:
            # A stray "|" or a bare "(ed)" leaves nothing to store; skipping
            # it avoids creating a nameless author document.
            continue
        # upsert returns a list of document ids; the first entry is the
        # author that was just updated or inserted.
        doc_ids = author_table.upsert(
            {"author_name": author_name},
            q.author_name == author_name,
        )
        if doc_ids[0] not in ids:
            ids.append(doc_ids[0])
    return ids
# %%
# Upsert one document per spreadsheet row into the "books" table, keyed on
# the title, storing the ids of the row's authors alongside it.
for i, row in book_table_df.iterrows():
    book_title = row.Title
    if pd.isna(book_title):
        # A missing title is NaN, and NaN never compares equal to itself, so
        # the title query could never match: every run would insert yet
        # another untitled document. Skip such rows instead.
        continue
    authors_str = row.Author
    author_ids = insert_authors(authors_str, book_title)
    book_table.upsert(
        {"book_title": book_title, "authors": author_ids},
        q.book_title == book_title,
    )
# %% So now we have a DB, we can load it back in as dataframes and do some things to it
# Not the most efficient way to do this, I'm sure, but we do also get a DB while we're at it
# Read back the file the database was writing to and parse it into plain dicts.
with open("library.json", mode="r", encoding="utf-8") as lib_json:
    library_json = json.loads(lib_json.read())
#%%
# Flatten the "books" table ({document id: record}) into a list of records,
# storing each document id on its own record under "id".
bj = []
for book_key, book_record in library_json["books"].items():
    print(book_key, book_record)
    book_record["id"] = book_key
    bj.append(book_record)
# One row per book, indexed by the document id taken from the JSON keys.
book_df = pd.DataFrame(bj).set_index("id")
book_df.head()
#%%
# Flatten the "authors" table ({document id: record}) into a list of records,
# storing each document id on its own record under "id".
aj = []
for author_key, author_record in library_json["authors"].items():
    print(author_key, author_record)
    author_record["id"] = author_key
    aj.append(author_record)
# One row per author, indexed by the document id taken from the JSON keys.
author_df = pd.DataFrame(aj).set_index("id")
# Give every author their own empty list, to be filled with book titles later.
author_df["book_list"] = author_df.apply(lambda _row: [], axis=1)
author_df.head()
#%% Make a backreference to the books so we can tell who wrote what.
# Titles are collected per author first and the column is written in one go.
# Assigning through author_df.loc[...].book_list is chained assignment: it
# writes to a temporary row object that pandas does not guarantee to propagate
# back into author_df (and never does under copy-on-write).
titles_by_author = {}
for i, row in book_df.iterrows():
    if not isinstance(row.authors, (list, tuple)):
        # No author list stored for this book.
        continue
    for a_id in row.authors:
        # author_df is indexed by the JSON keys, which are strings, hence
        # str(). The inner dict is used as an insertion-ordered set so each
        # title is listed once per author.
        titles_by_author.setdefault(str(a_id), {})[row.book_title] = None
# A book pointing at an author that is not in author_df is still an error, as
# it was when the row was looked up directly.
unknown_ids = [a_id for a_id in titles_by_author if a_id not in author_df.index]
if unknown_ids:
    raise KeyError(f"books reference author ids missing from author_df: {unknown_ids}")
author_df["book_list"] = [
    list(titles_by_author.get(author_id, {})) for author_id in author_df.index
]
author_df["book_count"] = author_df["book_list"].map(len)
author_df.head()
# %% dump to a csv to start adding extra columns
# Writes author_df (its "id" index plus all columns) to authors.csv in the
# working directory.
# NOTE(review): book_list cells are Python lists, so they presumably end up in
# the CSV as their text representation — confirm that suits the hand-editing step.
author_df.to_csv("authors.csv")
# %%