-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocs.py
More file actions
126 lines (109 loc) · 3.88 KB
/
docs.py
File metadata and controls
126 lines (109 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from staticjinja import Site
from pygments import highlight, token
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
import sys
import os
import markdown
import pyterrier_prebuilt as pb
INDEX_DIR="./indices/"
class PythonFunctionLexer(PythonLexer):
    """A PythonLexer that also highlights names as functions where they are called."""

    def get_tokens(self, s, unfiltered=False):
        """Yield ``(tokentype, value)`` pairs for *s*, retagging called names.

        The parent lexer's stream is read with one token of look-behind: a
        token typed exactly ``token.Name`` that is immediately followed by the
        punctuation token ``(`` is re-emitted as ``token.Name.Function``.
        Every other token is passed through unchanged.

        Args:
            s: Source text to tokenize.
            unfiltered: Forwarded to the parent ``get_tokens``.

        Yields:
            ``(tokentype, value)`` tuples in source order.
        """
        it = super().get_tokens(s, unfiltered)
        # A bare next() here would leak StopIteration out of this generator
        # (turned into RuntimeError by PEP 479) when the parent yields nothing.
        prev = next(it, None)
        if prev is None:
            return
        for tok in it:
            if prev[0] == token.Name and tok == (token.Punctuation, '('):
                yield token.Name.Function, prev[1]
            else:
                yield prev
            prev = tok
        # The last token has no successor, so it is emitted as-is.
        yield prev
def python_pprint(code):
    """Return *code* highlighted as HTML, with each newline replaced by ``<br/>``.

    Highlighting uses ``PythonFunctionLexer(ensurenl=False)`` and
    ``HtmlFormatter(nowrap=True)``.
    """
    lexer = PythonFunctionLexer(ensurenl=False)
    formatter = HtmlFormatter(nowrap=True)
    markup = highlight(code, lexer, formatter)
    # Splitting on newlines and re-joining is the same as a global replace.
    return '<br/>'.join(markup.split('\n'))
def sizeof_fmt(num, suffix='B'):
    """Format a size as a short human-readable string.

    The value is divided by 1024 until its magnitude is below 1024, then
    rendered with one decimal place followed by the unit letter and *suffix*
    (for example ``1536`` becomes ``"1.5KB"``).

    Args:
        num: The size to format; negative values are accepted.
        suffix: Text appended after the unit letter.

    Returns:
        The formatted string. Magnitudes of ``1024 ** 8`` and above are
        expressed in ``Y`` units.
    """
    for unit in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    # Still >= 1024 after the Z step: report in Y, using the same single-letter
    # unit style as the table above (this used to be the inconsistent 'Yi').
    return "%.1f%s%s" % (num, 'Y', suffix)
def dirsize(path):
    """Return the combined size in bytes of every file found beneath *path*.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``. A walk that finds no files gives 0.
    """
    return sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _subdirs, names in os.walk(path)
        for name in names
    )
def variant_size(dataset, variant):
    """Return the formatted total size of ``INDEX_DIR/<dataset>/<variant>/latest``.

    The byte count comes from ``dirsize`` and is rendered by ``sizeof_fmt``.
    """
    latest = os.path.join(INDEX_DIR, dataset, variant, "latest")
    size_in_bytes = dirsize(latest)
    return sizeof_fmt(size_in_bytes)
def variant_date(dataset, variant):
    """Return the last path component of what ``.../<variant>/latest`` resolves to.

    ``INDEX_DIR/<dataset>/<variant>/latest`` is resolved with
    ``os.path.realpath`` and the basename of the result is returned.
    NOTE(review): presumably ``latest`` is a symlink to a date-named
    directory, since callers store the result as "lastupdate" - confirm.
    """
    latest = os.path.join(INDEX_DIR, dataset, variant, "latest")
    return os.path.basename(os.path.realpath(latest))
if __name__ == "__main__":
    # Script entry point: gather per-dataset and per-variant metadata, then
    # render the templates into wwwroot/ with staticjinja.

    # sys.argv[0] is the script path, so only the remaining items are options.
    # (Comparing argv[0] against '--reload' could never match.)
    args = sys.argv[1:]
    config = {
        "datasets": []
    }
    import pyterrier_prebuilt as pb
    datasets = ["vaswani", "msmarco_document", "msmarco_passage", "msmarcov2_document", "msmarcov2_passage", "trec-covid"]
    for d in datasets:
        print("Dataset %s" % d)
        # The returned mapping is updated in place below and becomes the
        # dataset's entry in config["datasets"].
        meta = pb.get_thing(d, "bla", "DOC_INFO")
        meta["name"] = d
        meta["desc"] = markdown.markdown(meta["desc"])
        config["datasets"].append(meta)
        variants = pb.get_variants(d, INDEX_DIR)
        meta["variants"] = []
        # Placeholder until a variant supplies a greater "lastupdate" string.
        meta["lastupdate"] = "(unknown)"
        notebook_loc = os.path.join(INDEX_DIR, d, "retrieval.html")
        meta["notebook_present"] = os.path.exists(notebook_loc)
        print("Notebook found at %s: %s" % (notebook_loc, meta["notebook_present"]))
        # Variants whose name starts with 'terrier' come first, then the rest,
        # each group in alphabetical order.
        for v in sorted(variants, key=lambda x: (not x.startswith('terrier'), x)):
            print("Variant %s" % v)
            vmeta = {
                "name": v,
                "desc": markdown.markdown(pb.get_thing(d, v, 'get_variant_description')(v)),
                "pipes_header": pb.get_thing(d, v, 'get_retrieval_head')(d, v),
                "pipes": pb.get_thing(d, v, 'get_retrieval_pipelines')(d, v)
            }
            # A missing header is treated as "no header lines".
            if vmeta["pipes_header"] is None:
                vmeta["pipes_header"] = []
            # Example snippet: the header lines followed by one
            # "<label> = <pipeline>" line per pipeline, blank-line separated.
            vmeta['example'] = python_pprint('\n\n'.join(
                vmeta["pipes_header"]
                +
                [f'{l} = {pb.format_pipeline(r)}' for l, r in vmeta['pipes']]
            ))
            vmeta["lastupdate"] = variant_date(d, v)
            vmeta["size"] = variant_size(d, v)
            # Our string dates sort lexicographically, so plain string
            # comparison keeps the most recent one.
            # NOTE(review): this relies on every date string comparing greater
            # than the "(unknown)" placeholder - true if they start with a digit.
            if vmeta["lastupdate"] > meta["lastupdate"]:
                meta["lastupdate"] = vmeta["lastupdate"]
            meta["variants"].append(vmeta)
        meta["variant_count"] = len(variants)

    # Look-up from dataset name to its metadata, used for per-dataset pages.
    configmap = {ds["name"]: ds for ds in config["datasets"]}

    def get_dataset_context(template):
        """Return the extra template context for a ``<name>.dataset.html`` page.

        The dataset name is the part of the template's base filename before
        the first dot. Templates whose filename does not contain
        ``.dataset.html`` get an empty context.

        Raises:
            KeyError: If the name is not one of the collected datasets.
        """
        print(template.filename)
        filename = os.path.basename(template.filename)
        if ".dataset.html" not in filename:
            return {}
        datasetname = filename.split(".")[0]
        return {"dataset": configmap[datasetname]}

    site = Site.make_site(
        outpath='wwwroot/',
        env_globals=config,
        contexts=[('.*.dataset.html', get_dataset_context)],
    )
    if '--reload' in args:
        site.render(use_reloader=True)
    else:
        site.render()