This repository was archived by the owner on Sep 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtally_age.py
109 lines (85 loc) · 3.33 KB
/
tally_age.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import sys
import pathlib
import re
import json
import datetime
import re
import statistics
from consolidate import consolidate
from extract_metadata import extract_metadata
from score import score
from slugify import slugify
from utilities import get_next_filename, parse_config_arguments, classify_occurrence, delineate_segments, COLUMNS
def file_age(file, today, age_list):
if pathlib.Path(file).suffix != '.md':
return
try:
content = pathlib.Path(file).read_text(errors="replace")
except UnicodeDecodeError:
print("tally_age, WARNING, Skipping file that contains non-UTF-8 characters and should be converted, , {}".format(file))
return
# Find ms.date line in the content. Regex accomodates odd variations found in repos, such
# as excess whitespace, quotes around dates.
msdate = re.findall("ms.date:\s*\"?(\d{1,2}\/\d{1,2}\/\d{2,4})\"?", content)
# Content check: if metadata_text is empty, then the article lacks metadata
if len(msdate) == 0:
print("tally_age, File contains no metadata--skipping, , {}".format(file))
return
try:
article_date = datetime.datetime.strptime(msdate[0], "%m/%d/%Y")
age = (today.date() - article_date.date()).days
age_list.append(age)
except ValueError:
print(f"Skipping: malformed date in {file}")
def collect_stats(dict, data):
dict["values"] = data
if len(data) > 0:
dict["mean"] = statistics.mean(data)
else:
dict["mean"] = 0
if len(data) > 1:
dict["stddev"] = statistics.stdev(data)
else:
dict["stddev"] = 0
def tally_age(root_path, save_file):
results = {}
today = datetime.datetime.now()
exclude_folders = ["media", "breadcrumb", ]
# For output, we want a dictionary with top-level folder names and an array
# of days-old values for all articles contained in that folder and any child
# folders. So each dictionary entry is itself a dictionary: "folder": { "mean": xxx, "stddev": yyy, "values"" array"}
results = {}
root_ages = []
# Iterate the top-level folders
for item in os.listdir(root_path):
full_path = os.path.join(root_path, item)
if not os.path.isdir(full_path):
# Process a file at the root
file_age(full_path, today, root_ages)
else:
# Process a folder
if item in exclude_folders:
continue
folder_result = {}
ages = []
# Iterate the entire subfolder, adding all article
# ages to the ages list.
for root, dirs, files in os.walk(full_path):
for exclusion in exclude_folders:
if exclusion in dirs:
dirs.remove(exclusion)
for file in files:
file_age(os.path.join(root, file), today, ages)
collect_stats(folder_result, ages)
results[item] = folder_result
collect_stats(results, root_ages)
with open(save_file, 'w') as fp:
json.dump(results, fp)
if __name__ == "__main__":
root_path = sys.argv[1]
json_path = sys.argv[2]
if root_path is None or json_path is None:
print("Usage: python tally_age.py <root-path> <json-file-path>")
sys.exit(2)
tally_age(root_path, json_path)