-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCorpusProcessor.py
42 lines (23 loc) · 971 Bytes
/
CorpusProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import requests
def prepare_dataset(translated_path="translated.txt",
                    spanish_path="Spanish_articles.tsv",
                    output_path="parallel_corpus.json"):
    """Build a parallel corpus from translated articles and their Spanish originals.

    Reads ``translated_path`` (one ``key<TAB>json`` record per line) into a
    map keyed by article id. Then walks ``spanish_path`` (tab-separated; the
    first column is an article URL containing ``id=<article id>``, the second
    column is a label). For every row that is not labelled ``Irrelevant`` and
    whose id is present in the translated map, the article URL is fetched
    with ``requests.get``; the ``data.content`` field of the JSON response is
    stored on the translated record under ``"content_es"``. The enriched
    records are written to ``output_path`` as a JSON list.

    Args:
        translated_path: Path of the ``key<TAB>json`` translated-articles file.
        spanish_path: Path of the tab-separated Spanish articles listing.
        output_path: Path of the JSON file to write.

    Returns:
        None. The result is written to ``output_path`` and a summary line is
        printed.

    Raises:
        ValueError: If a non-blank line of ``translated_path`` has no tab or
            its payload is not valid JSON.
        KeyError: If a fetched response lacks ``"data"`` or ``"content"``.
    """
    articles_map = {}
    with open(translated_path, "r") as translated_file:
        for line in translated_file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            # Split on the first tab only: the key never contains a tab, and
            # everything after it is the JSON payload.
            key, article_json = line.split("\t", 1)
            articles_map[key] = json.loads(article_json)

    relevant_articles = []
    with open(spanish_path, "r") as spanish_file:
        for line in spanish_file:
            fields = line.rstrip("\r\n").split("\t")
            # Skip blank lines, header rows and any row whose URL carries no
            # "id=" parameter instead of crashing on them.
            if len(fields) < 2 or "id=" not in fields[0]:
                continue
            article_url = fields[0]
            label = fields[1].strip()
            article_id = article_url.split("id=")[1]
            print(article_id)
            # Compare by value: `is` tests object identity and does not match
            # a string that was read from a file.
            if label == "Irrelevant":
                continue
            if article_id in articles_map:
                response = requests.get(article_url)
                article_es = json.loads(response.content)["data"]
                articles_map[article_id]["content_es"] = article_es["content"]
                relevant_articles.append(articles_map[article_id])

    # The context manager guarantees the output is flushed and closed.
    with open(output_path, "w") as output_file:
        json.dump(relevant_articles, output_file)
    print("Saved Articles: %d" % len(relevant_articles))