-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetadata.py
74 lines (63 loc) · 2.05 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
This code is copied from metadata.ipynb and adapted
to run as a pipeline step.
Input (stdin): A JSON document with the jobs and their metadata
Output (stdout): A JSON document with the jobs and more metadata
This script must preserve the metadata coming from the scrapers
and enrich it based on the text in the description.
"""
import sys
import json
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from pathlib import Path
import operator as op
# Parse the complete JSON document piped in on stdin.
data = json.load(sys.stdin)

# Keep only the known job fields, in a fixed column order.
df = pd.DataFrame(
    data,
    columns=["tags", "descrip", "title", "url", "file", "metadata"],
)

# Rows whose metadata is missing (as judged by pd.isna) get an empty dict so
# later steps can treat the column uniformly.
df["metadata"] = df["metadata"].apply(lambda meta: {} if pd.isna(meta) else meta)
# English pipeline shared by every matcher and by the description parsing.
nlp = spacy.load("en_core_web_sm")

# LOCATION: single-token patterns, either a token carrying one of the listed
# entity types or the literal word "worldwide" (case-insensitive).
location_mat = Matcher(nlp.vocab)
location_pats = [[{"ENT_TYPE": label}] for label in ("LOC", "NORP", "GPE", "TIME")]
location_pats.append([{"LOWER": "worldwide"}])
location_mat.add("LOCATION", location_pats)

# REMOTE: the single word "remote" (case-insensitive).
remote_mat = Matcher(nlp.vocab)
remote_pats = [[{"LOWER": "remote"}]]
remote_mat.add("REMOTE", remote_pats)
def collect_matcher(m, doc, window=0):
    """Return a standalone doc for every match of ``m`` in ``doc``.

    Args:
        m: Callable that takes ``doc`` and yields ``(match_id, start, end)``
            triples (e.g. a spaCy ``Matcher``).
        doc: Token sequence to search; slicing it must return an object that
            has an ``as_doc()`` method.
        window: Number of extra tokens to include on each side of a match.

    Returns:
        A list with one ``as_doc()`` result per match, in match order.
    """
    return [
        # Clamp the left edge at 0: a negative slice start would be counted
        # from the end of the doc and select the wrong (often empty) span.
        doc[max(start - window, 0):end + window].as_doc()
        for _, start, end in m(doc)
    ]
def add_remote_metadata(row):
    """Return the row's metadata, adding a "remote" entry when one is found.

    The description is scanned for the word "remote"; location-like tokens
    within a 5-token window of each occurrence are collected, de-duplicated,
    sorted and joined with spaces to form the "remote" value.

    Args:
        row: A job record exposing ``metadata`` (a dict), ``descrip`` and
            ``title`` attributes, e.g. one row of the jobs DataFrame.

    Returns:
        ``row.metadata`` unchanged when it already has a "remote" key, when
        the description is not a string, or when no location is found;
        otherwise a new dict with the extra "remote" key.
    """
    metadata = row.metadata
    # Never overwrite a value supplied by the scraper.
    if "remote" in metadata:
        return metadata

    description = row.descrip
    # A missing description (e.g. NaN or None) is not text the NLP pipeline
    # can parse, and there is nothing to enrich from in that case.
    if not isinstance(description, str):
        return metadata

    # Token windows around every occurrence of the word "remote".
    contexts = collect_matcher(remote_mat, nlp(description), 5)
    locations = {
        str(hit).strip()
        for context in contexts
        for hit in collect_matcher(location_mat, context)
    }
    if not locations:
        return metadata

    print("New remote location found", row.title, locations, file=sys.stderr)
    return {**metadata, "remote": " ".join(sorted(locations)).strip()}
# Enrich the metadata row by row. Explicit iteration is used instead of
# DataFrame.apply(axis=1) because, for an empty frame, apply can hand back a
# DataFrame rather than a Series, which cannot be assigned to a single
# column; an empty job list should simply produce "[]".
df["metadata"] = [add_remote_metadata(row) for _, row in df.iterrows()]

# Emit only the output fields; the "file" column is not forwarded.
json_out = [
    {
        "title": row.title,
        "descrip": row.descrip,
        "url": row.url,
        "tags": row.tags,
        "metadata": row.metadata,
    }
    for _, row in df.iterrows()
]
print(json.dumps(json_out, indent=2))