Commit 81ea1d7

rename venues to sources
1 parent 16491ba · commit 81ea1d7

4 files changed: 151 additions, 150 deletions


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+/.idea

copy-openalex-csv.sql

Lines changed: 13 additions & 12 deletions
@@ -1,11 +1,3 @@
---institutions
-
-\copy openalex.institutions (id, ror, display_name, country_code, type, homepage_url, image_url, image_thumbnail_url, display_name_acroynyms, display_name_alternatives, works_count, cited_by_count, works_api_url, updated_date) from program 'gunzip -c csv-files/institutions.csv.gz' csv header
-\copy openalex.institutions_ids (institution_id, openalex, ror, grid, wikipedia, wikidata, mag) from program 'gunzip -c csv-files/institutions_ids.csv.gz' csv header
-\copy openalex.institutions_geo (institution_id, city, geonames_city_id, region, country_code, country, latitude, longitude) from program 'gunzip -c csv-files/institutions_geo.csv.gz' csv header
-\copy openalex.institutions_associated_institutions (institution_id, associated_institution_id, relationship) from program 'gunzip -c csv-files/institutions_associated_institutions.csv.gz' csv header
-\copy openalex.institutions_counts_by_year (institution_id, year, works_count, cited_by_count) from program 'gunzip -c csv-files/institutions_counts_by_year.csv.gz' csv header
-
 --authors
 
 \copy openalex.authors (id, orcid, display_name, display_name_alternatives, works_count, cited_by_count, last_known_institution, works_api_url, updated_date) from program 'gunzip -c csv-files/authors.csv.gz' csv header
@@ -20,13 +12,22 @@
 \copy openalex.concepts_ids (concept_id, openalex, wikidata, wikipedia, umls_aui, umls_cui, mag) from program 'gunzip -c csv-files/concepts_ids.csv.gz' csv header
 \copy openalex.concepts_related_concepts (concept_id, related_concept_id, score) from program 'gunzip -c csv-files/concepts_related_concepts.csv.gz' csv header
 
---venues
+--institutions
 
-\copy openalex.venues (id, issn_l, issn, display_name, publisher, works_count, cited_by_count, is_oa, is_in_doaj, homepage_url, works_api_url, updated_date) from program 'gunzip -c csv-files/venues.csv.gz' csv header
-\copy openalex.venues_ids (venue_id, openalex, issn_l, issn, mag) from program 'gunzip -c csv-files/venues_ids.csv.gz' csv header
-\copy openalex.venues_counts_by_year (venue_id, year, works_count, cited_by_count) from program 'gunzip -c csv-files/venues_counts_by_year.csv.gz' csv header
+\copy openalex.institutions (id, ror, display_name, country_code, type, homepage_url, image_url, image_thumbnail_url, display_name_acroynyms, display_name_alternatives, works_count, cited_by_count, works_api_url, updated_date) from program 'gunzip -c csv-files/institutions.csv.gz' csv header
+\copy openalex.institutions_ids (institution_id, openalex, ror, grid, wikipedia, wikidata, mag) from program 'gunzip -c csv-files/institutions_ids.csv.gz' csv header
+\copy openalex.institutions_geo (institution_id, city, geonames_city_id, region, country_code, country, latitude, longitude) from program 'gunzip -c csv-files/institutions_geo.csv.gz' csv header
+\copy openalex.institutions_associated_institutions (institution_id, associated_institution_id, relationship) from program 'gunzip -c csv-files/institutions_associated_institutions.csv.gz' csv header
+\copy openalex.institutions_counts_by_year (institution_id, year, works_count, cited_by_count) from program 'gunzip -c csv-files/institutions_counts_by_year.csv.gz' csv header
+
+--sources
+
+\copy openalex.sources (id, issn_l, issn, display_name, publisher, works_count, cited_by_count, is_oa, is_in_doaj, homepage_url, works_api_url, updated_date) from program 'gunzip -c csv-files/sources.csv.gz' csv header
+\copy openalex.sources_ids (source_id, openalex, issn_l, issn, mag) from program 'gunzip -c csv-files/sources_ids.csv.gz' csv header
+\copy openalex.sources_counts_by_year (source_id, year, works_count, cited_by_count) from program 'gunzip -c csv-files/sources_counts_by_year.csv.gz' csv header
 
 --works
+
 \copy openalex.works (id, doi, title, display_name, publication_year, publication_date, type, cited_by_count, is_retracted, is_paratext, cited_by_api_url, abstract_inverted_index) from program 'gunzip -c csv-files/works.csv.gz' csv header
 \copy openalex.works_host_venues (work_id, venue_id, url, is_oa, version, license) from program 'gunzip -c csv-files/works_host_venues.csv.gz' csv header
 \copy openalex.works_alternate_host_venues (work_id, venue_id, url, is_oa, version, license) from program 'gunzip -c csv-files/works_alternate_host_venues.csv.gz' csv header
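
After this rename the loader expects sources.csv.gz, sources_ids.csv.gz, and sources_counts_by_year.csv.gz where it previously read the venues files. A minimal pre-flight check, sketched in Python, can confirm the renamed files are in place before running the script; the csv-files/ directory comes from the \copy commands above, and the helper name check_renamed_csvs.py is purely illustrative, not part of this commit:

# check_renamed_csvs.py (hypothetical helper, not part of this commit)
# Verify the three renamed CSVs exist before invoking copy-openalex-csv.sql.
import os

RENAMED = ['sources.csv.gz', 'sources_ids.csv.gz', 'sources_counts_by_year.csv.gz']

for name in RENAMED:
    path = os.path.join('csv-files', name)
    print(f"{path}: {'ok' if os.path.exists(path) else 'MISSING'}")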

flatten-openalex-jsonl.py

Lines changed: 123 additions & 124 deletions
@@ -10,41 +10,6 @@
 FILES_PER_ENTITY = int(os.environ.get('OPENALEX_DEMO_FILES_PER_ENTITY', '0'))
 
 csv_files = {
-    'institutions': {
-        'institutions': {
-            'name': os.path.join(CSV_DIR, 'institutions.csv.gz'),
-            'columns': [
-                'id', 'ror', 'display_name', 'country_code', 'type', 'homepage_url', 'image_url', 'image_thumbnail_url',
-                'display_name_acroynyms', 'display_name_alternatives', 'works_count', 'cited_by_count', 'works_api_url',
-                'updated_date'
-            ]
-        },
-        'ids': {
-            'name': os.path.join(CSV_DIR, 'institutions_ids.csv.gz'),
-            'columns': [
-                'institution_id', 'openalex', 'ror', 'grid', 'wikipedia', 'wikidata', 'mag'
-            ]
-        },
-        'geo': {
-            'name': os.path.join(CSV_DIR, 'institutions_geo.csv.gz'),
-            'columns': [
-                'institution_id', 'city', 'geonames_city_id', 'region', 'country_code', 'country', 'latitude',
-                'longitude'
-            ]
-        },
-        'associated_institutions': {
-            'name': os.path.join(CSV_DIR, 'institutions_associated_institutions.csv.gz'),
-            'columns': [
-                'institution_id', 'associated_institution_id', 'relationship'
-            ]
-        },
-        'counts_by_year': {
-            'name': os.path.join(CSV_DIR, 'institutions_counts_by_year.csv.gz'),
-            'columns': [
-                'institution_id', 'year', 'works_count', 'cited_by_count'
-            ]
-        }
-    },
     'authors': {
         'authors': {
             'name': os.path.join(CSV_DIR, 'authors.csv.gz'),
@@ -91,21 +56,56 @@
             'columns': ['concept_id', 'related_concept_id', 'score']
         }
     },
-    'venues': {
-        'venues': {
-            'name': os.path.join(CSV_DIR, 'venues.csv.gz'),
+    'institutions': {
+        'institutions': {
+            'name': os.path.join(CSV_DIR, 'institutions.csv.gz'),
+            'columns': [
+                'id', 'ror', 'display_name', 'country_code', 'type', 'homepage_url', 'image_url', 'image_thumbnail_url',
+                'display_name_acroynyms', 'display_name_alternatives', 'works_count', 'cited_by_count', 'works_api_url',
+                'updated_date'
+            ]
+        },
+        'ids': {
+            'name': os.path.join(CSV_DIR, 'institutions_ids.csv.gz'),
+            'columns': [
+                'institution_id', 'openalex', 'ror', 'grid', 'wikipedia', 'wikidata', 'mag'
+            ]
+        },
+        'geo': {
+            'name': os.path.join(CSV_DIR, 'institutions_geo.csv.gz'),
+            'columns': [
+                'institution_id', 'city', 'geonames_city_id', 'region', 'country_code', 'country', 'latitude',
+                'longitude'
+            ]
+        },
+        'associated_institutions': {
+            'name': os.path.join(CSV_DIR, 'institutions_associated_institutions.csv.gz'),
+            'columns': [
+                'institution_id', 'associated_institution_id', 'relationship'
+            ]
+        },
+        'counts_by_year': {
+            'name': os.path.join(CSV_DIR, 'institutions_counts_by_year.csv.gz'),
+            'columns': [
+                'institution_id', 'year', 'works_count', 'cited_by_count'
+            ]
+        }
+    },
+    'sources': {
+        'sources': {
+            'name': os.path.join(CSV_DIR, 'sources.csv.gz'),
             'columns': [
                 'id', 'issn_l', 'issn', 'display_name', 'publisher', 'works_count', 'cited_by_count', 'is_oa',
                 'is_in_doaj', 'homepage_url', 'works_api_url', 'updated_date'
             ]
         },
         'ids': {
-            'name': os.path.join(CSV_DIR, 'venues_ids.csv.gz'),
-            'columns': ['venue_id', 'openalex', 'issn_l', 'issn', 'mag']
+            'name': os.path.join(CSV_DIR, 'sources_ids.csv.gz'),
+            'columns': ['source_id', 'openalex', 'issn_l', 'issn', 'mag']
         },
         'counts_by_year': {
-            'name': os.path.join(CSV_DIR, 'venues_counts_by_year.csv.gz'),
-            'columns': ['venue_id', 'year', 'works_count', 'cited_by_count']
+            'name': os.path.join(CSV_DIR, 'sources_counts_by_year.csv.gz'),
+            'columns': ['source_id', 'year', 'works_count', 'cited_by_count']
        },
     },
     'works': {
@@ -180,6 +180,57 @@
 }
 
 
+def flatten_authors():
+    file_spec = csv_files['authors']
+
+    with gzip.open(file_spec['authors']['name'], 'wt', encoding='utf-8') as authors_csv, \
+            gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
+            gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
+
+        authors_writer = csv.DictWriter(
+            authors_csv, fieldnames=file_spec['authors']['columns'], extrasaction='ignore'
+        )
+        authors_writer.writeheader()
+
+        ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns'])
+        ids_writer.writeheader()
+
+        counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns'])
+        counts_by_year_writer.writeheader()
+
+        files_done = 0
+        for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'authors', '*', '*.gz')):
+            print(jsonl_file_name)
+            with gzip.open(jsonl_file_name, 'r') as authors_jsonl:
+                for author_json in authors_jsonl:
+                    if not author_json.strip():
+                        continue
+
+                    author = json.loads(author_json)
+
+                    if not (author_id := author.get('id')):
+                        continue
+
+                    # authors
+                    author['display_name_alternatives'] = json.dumps(author.get('display_name_alternatives'), ensure_ascii=False)
+                    author['last_known_institution'] = (author.get('last_known_institution') or {}).get('id')
+                    authors_writer.writerow(author)
+
+                    # ids
+                    if author_ids := author.get('ids'):
+                        author_ids['author_id'] = author_id
+                        ids_writer.writerow(author_ids)
+
+                    # counts_by_year
+                    if counts_by_year := author.get('counts_by_year'):
+                        for count_by_year in counts_by_year:
+                            count_by_year['author_id'] = author_id
+                            counts_by_year_writer.writerow(count_by_year)
+            files_done += 1
+            if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
+                break
+
+
 def flatten_concepts():
     with gzip.open(csv_files['concepts']['concepts']['name'], 'wt', encoding='utf-8') as concepts_csv, \
             gzip.open(csv_files['concepts']['ancestors']['name'], 'wt', encoding='utf-8') as ancestors_csv, \
@@ -256,57 +307,6 @@ def flatten_concepts():
                 break
 
 
-def flatten_venues():
-    with gzip.open(csv_files['venues']['venues']['name'], 'wt', encoding='utf-8') as venues_csv, \
-            gzip.open(csv_files['venues']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
-            gzip.open(csv_files['venues']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
-
-        venues_writer = csv.DictWriter(
-            venues_csv, fieldnames=csv_files['venues']['venues']['columns'], extrasaction='ignore'
-        )
-        venues_writer.writeheader()
-
-        ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['venues']['ids']['columns'])
-        ids_writer.writeheader()
-
-        counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['venues']['counts_by_year']['columns'])
-        counts_by_year_writer.writeheader()
-
-        seen_venue_ids = set()
-
-        files_done = 0
-        for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'venues', '*', '*.gz')):
-            print(jsonl_file_name)
-            with gzip.open(jsonl_file_name, 'r') as venues_jsonl:
-                for venue_json in venues_jsonl:
-                    if not venue_json.strip():
-                        continue
-
-                    venue = json.loads(venue_json)
-
-                    if not (venue_id := venue.get('id')) or venue_id in seen_venue_ids:
-                        continue
-
-                    seen_venue_ids.add(venue_id)
-
-                    venue['issn'] = json.dumps(venue.get('issn'))
-                    venues_writer.writerow(venue)
-
-                    if venue_ids := venue.get('ids'):
-                        venue_ids['venue_id'] = venue_id
-                        venue_ids['issn'] = json.dumps(venue_ids.get('issn'))
-                        ids_writer.writerow(venue_ids)
-
-                    if counts_by_year := venue.get('counts_by_year'):
-                        for count_by_year in counts_by_year:
-                            count_by_year['venue_id'] = venue_id
-                            counts_by_year_writer.writerow(count_by_year)
-
-            files_done += 1
-            if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
-                break
-
-
 def flatten_institutions():
     file_spec = csv_files['institutions']
 
@@ -390,58 +390,57 @@ def flatten_institutions():
                 break
 
 
-def flatten_authors():
-    file_spec = csv_files['authors']
-
-    with gzip.open(file_spec['authors']['name'], 'wt', encoding='utf-8') as authors_csv, \
-            gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
-            gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
+def flatten_sources():
+    with gzip.open(csv_files['sources']['sources']['name'], 'wt', encoding='utf-8') as sources_csv, \
+            gzip.open(csv_files['sources']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
+            gzip.open(csv_files['sources']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
 
-        authors_writer = csv.DictWriter(
-            authors_csv, fieldnames=file_spec['authors']['columns'], extrasaction='ignore'
+        sources_writer = csv.DictWriter(
+            sources_csv, fieldnames=csv_files['sources']['sources']['columns'], extrasaction='ignore'
         )
-        authors_writer.writeheader()
+        sources_writer.writeheader()
 
-        ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns'])
+        ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['sources']['ids']['columns'])
         ids_writer.writeheader()
 
-        counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns'])
+        counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['sources']['counts_by_year']['columns'])
        counts_by_year_writer.writeheader()
 
+        seen_source_ids = set()
+
         files_done = 0
-        for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'authors', '*', '*.gz')):
+        for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'sources', '*', '*.gz')):
             print(jsonl_file_name)
-            with gzip.open(jsonl_file_name, 'r') as authors_jsonl:
-                for author_json in authors_jsonl:
-                    if not author_json.strip():
+            with gzip.open(jsonl_file_name, 'r') as sources_jsonl:
+                for source_json in sources_jsonl:
+                    if not source_json.strip():
                         continue
 
-                    author = json.loads(author_json)
+                    source = json.loads(source_json)
 
-                    if not (author_id := author.get('id')):
+                    if not (source_id := source.get('id')) or source_id in seen_source_ids:
                         continue
 
-                    # authors
-                    author['display_name_alternatives'] = json.dumps(author.get('display_name_alternatives'), ensure_ascii=False)
-                    author['last_known_institution'] = (author.get('last_known_institution') or {}).get('id')
-                    authors_writer.writerow(author)
+                    seen_source_ids.add(source_id)
 
-                    # ids
-                    if author_ids := author.get('ids'):
-                        author_ids['author_id'] = author_id
-                        ids_writer.writerow(author_ids)
+                    source['issn'] = json.dumps(source.get('issn'))
+                    sources_writer.writerow(source)
 
-                    # counts_by_year
-                    if counts_by_year := author.get('counts_by_year'):
+                    if source_ids := source.get('ids'):
+                        source_ids['source_id'] = source_id
+                        source_ids['issn'] = json.dumps(source_ids.get('issn'))
+                        ids_writer.writerow(source_ids)
+
+                    if counts_by_year := source.get('counts_by_year'):
                         for count_by_year in counts_by_year:
-                            count_by_year['author_id'] = author_id
+                            count_by_year['source_id'] = source_id
                             counts_by_year_writer.writerow(count_by_year)
+
             files_done += 1
             if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
                 break
 
 
-
 def flatten_works():
     file_spec = csv_files['works']
 
@@ -590,9 +589,9 @@ def init_dict_writer(csv_file, file_spec, **kwargs):
 
 
 if __name__ == '__main__':
+    flatten_authors()
     flatten_concepts()
-    flatten_venues()
     flatten_institutions()
-    flatten_authors()
+    flatten_sources()
     flatten_works()
 
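
With the rename in place, a quick smoke test of the flattener is to cap input at one file per entity via the OPENALEX_DEMO_FILES_PER_ENTITY variable read at the top of the script (the files_done counter breaks each loop once the limit is reached). The invocation below is a sketch; it assumes the command is run from the repository root:

# Hypothetical smoke test: flatten a single snapshot file per entity.
# OPENALEX_DEMO_FILES_PER_ENTITY is the limit read by the script above.
import os
import subprocess

env = dict(os.environ, OPENALEX_DEMO_FILES_PER_ENTITY='1')
subprocess.run(['python', 'flatten-openalex-jsonl.py'], env=env, check=True)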
