 10 |  10 |   FILES_PER_ENTITY = int(os.environ.get('OPENALEX_DEMO_FILES_PER_ENTITY', '0'))
 11 |  11 |   
 12 |  12 |   csv_files = {
 13 |     | -     'institutions': {
 14 |     | -         'institutions': {
 15 |     | -             'name': os.path.join(CSV_DIR, 'institutions.csv.gz'),
 16 |     | -             'columns': [
 17 |     | -                 'id', 'ror', 'display_name', 'country_code', 'type', 'homepage_url', 'image_url', 'image_thumbnail_url',
 18 |     | -                 'display_name_acroynyms', 'display_name_alternatives', 'works_count', 'cited_by_count', 'works_api_url',
 19 |     | -                 'updated_date'
 20 |     | -             ]
 21 |     | -         },
 22 |     | -         'ids': {
 23 |     | -             'name': os.path.join(CSV_DIR, 'institutions_ids.csv.gz'),
 24 |     | -             'columns': [
 25 |     | -                 'institution_id', 'openalex', 'ror', 'grid', 'wikipedia', 'wikidata', 'mag'
 26 |     | -             ]
 27 |     | -         },
 28 |     | -         'geo': {
 29 |     | -             'name': os.path.join(CSV_DIR, 'institutions_geo.csv.gz'),
 30 |     | -             'columns': [
 31 |     | -                 'institution_id', 'city', 'geonames_city_id', 'region', 'country_code', 'country', 'latitude',
 32 |     | -                 'longitude'
 33 |     | -             ]
 34 |     | -         },
 35 |     | -         'associated_institutions': {
 36 |     | -             'name': os.path.join(CSV_DIR, 'institutions_associated_institutions.csv.gz'),
 37 |     | -             'columns': [
 38 |     | -                 'institution_id', 'associated_institution_id', 'relationship'
 39 |     | -             ]
 40 |     | -         },
 41 |     | -         'counts_by_year': {
 42 |     | -             'name': os.path.join(CSV_DIR, 'institutions_counts_by_year.csv.gz'),
 43 |     | -             'columns': [
 44 |     | -                 'institution_id', 'year', 'works_count', 'cited_by_count'
 45 |     | -             ]
 46 |     | -         }
 47 |     | -     },
 48 |  13 |       'authors': {
 49 |  14 |           'authors': {
 50 |  15 |               'name': os.path.join(CSV_DIR, 'authors.csv.gz'),

 91 |  56 |               'columns': ['concept_id', 'related_concept_id', 'score']
 92 |  57 |           }
 93 |  58 |       },
 94 |     | -     'venues': {
 95 |     | -         'venues': {
 96 |     | -             'name': os.path.join(CSV_DIR, 'venues.csv.gz'),
    |  59 | +     'institutions': {
    |  60 | +         'institutions': {
    |  61 | +             'name': os.path.join(CSV_DIR, 'institutions.csv.gz'),
    |  62 | +             'columns': [
    |  63 | +                 'id', 'ror', 'display_name', 'country_code', 'type', 'homepage_url', 'image_url', 'image_thumbnail_url',
    |  64 | +                 'display_name_acroynyms', 'display_name_alternatives', 'works_count', 'cited_by_count', 'works_api_url',
    |  65 | +                 'updated_date'
    |  66 | +             ]
    |  67 | +         },
    |  68 | +         'ids': {
    |  69 | +             'name': os.path.join(CSV_DIR, 'institutions_ids.csv.gz'),
    |  70 | +             'columns': [
    |  71 | +                 'institution_id', 'openalex', 'ror', 'grid', 'wikipedia', 'wikidata', 'mag'
    |  72 | +             ]
    |  73 | +         },
    |  74 | +         'geo': {
    |  75 | +             'name': os.path.join(CSV_DIR, 'institutions_geo.csv.gz'),
    |  76 | +             'columns': [
    |  77 | +                 'institution_id', 'city', 'geonames_city_id', 'region', 'country_code', 'country', 'latitude',
    |  78 | +                 'longitude'
    |  79 | +             ]
    |  80 | +         },
    |  81 | +         'associated_institutions': {
    |  82 | +             'name': os.path.join(CSV_DIR, 'institutions_associated_institutions.csv.gz'),
    |  83 | +             'columns': [
    |  84 | +                 'institution_id', 'associated_institution_id', 'relationship'
    |  85 | +             ]
    |  86 | +         },
    |  87 | +         'counts_by_year': {
    |  88 | +             'name': os.path.join(CSV_DIR, 'institutions_counts_by_year.csv.gz'),
    |  89 | +             'columns': [
    |  90 | +                 'institution_id', 'year', 'works_count', 'cited_by_count'
    |  91 | +             ]
    |  92 | +         }
    |  93 | +     },
    |  94 | +     'sources': {
    |  95 | +         'sources': {
    |  96 | +             'name': os.path.join(CSV_DIR, 'sources.csv.gz'),
 97 |  97 |               'columns': [
 98 |  98 |                   'id', 'issn_l', 'issn', 'display_name', 'publisher', 'works_count', 'cited_by_count', 'is_oa',
 99 |  99 |                   'is_in_doaj', 'homepage_url', 'works_api_url', 'updated_date'
100 | 100 |               ]
101 | 101 |           },
102 | 102 |           'ids': {
103 |     | -             'name': os.path.join(CSV_DIR, 'venues_ids.csv.gz'),
104 |     | -             'columns': ['venue_id', 'openalex', 'issn_l', 'issn', 'mag']
    | 103 | +             'name': os.path.join(CSV_DIR, 'sources_ids.csv.gz'),
    | 104 | +             'columns': ['source_id', 'openalex', 'issn_l', 'issn', 'mag']
105 | 105 |           },
106 | 106 |           'counts_by_year': {
107 |     | -             'name': os.path.join(CSV_DIR, 'venues_counts_by_year.csv.gz'),
108 |     | -             'columns': ['venue_id', 'year', 'works_count', 'cited_by_count']
    | 107 | +             'name': os.path.join(CSV_DIR, 'sources_counts_by_year.csv.gz'),
    | 108 | +             'columns': ['source_id', 'year', 'works_count', 'cited_by_count']
109 | 109 |           },
110 | 110 |       },
111 | 111 |       'works': {

180 | 180 |   }
181 | 181 |   
182 | 182 |   
    | 183 | + def flatten_authors():
    | 184 | +     file_spec = csv_files['authors']
    | 185 | + 
    | 186 | +     with gzip.open(file_spec['authors']['name'], 'wt', encoding='utf-8') as authors_csv, \
    | 187 | +             gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
    | 188 | +             gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
    | 189 | + 
    | 190 | +         authors_writer = csv.DictWriter(
    | 191 | +             authors_csv, fieldnames=file_spec['authors']['columns'], extrasaction='ignore'
    | 192 | +         )
    | 193 | +         authors_writer.writeheader()
    | 194 | + 
    | 195 | +         ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns'])
    | 196 | +         ids_writer.writeheader()
    | 197 | + 
    | 198 | +         counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns'])
    | 199 | +         counts_by_year_writer.writeheader()
    | 200 | + 
    | 201 | +         files_done = 0
    | 202 | +         for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'authors', '*', '*.gz')):
    | 203 | +             print(jsonl_file_name)
    | 204 | +             with gzip.open(jsonl_file_name, 'r') as authors_jsonl:
    | 205 | +                 for author_json in authors_jsonl:
    | 206 | +                     if not author_json.strip():
    | 207 | +                         continue
    | 208 | + 
    | 209 | +                     author = json.loads(author_json)
    | 210 | + 
    | 211 | +                     if not (author_id := author.get('id')):
    | 212 | +                         continue
    | 213 | + 
    | 214 | +                     # authors
    | 215 | +                     author['display_name_alternatives'] = json.dumps(author.get('display_name_alternatives'), ensure_ascii=False)
    | 216 | +                     author['last_known_institution'] = (author.get('last_known_institution') or {}).get('id')
    | 217 | +                     authors_writer.writerow(author)
    | 218 | + 
    | 219 | +                     # ids
    | 220 | +                     if author_ids := author.get('ids'):
    | 221 | +                         author_ids['author_id'] = author_id
    | 222 | +                         ids_writer.writerow(author_ids)
    | 223 | + 
    | 224 | +                     # counts_by_year
    | 225 | +                     if counts_by_year := author.get('counts_by_year'):
    | 226 | +                         for count_by_year in counts_by_year:
    | 227 | +                             count_by_year['author_id'] = author_id
    | 228 | +                             counts_by_year_writer.writerow(count_by_year)
    | 229 | +             files_done += 1
    | 230 | +             if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
    | 231 | +                 break
    | 232 | + 
    | 233 | + 
183 | 234 |   def flatten_concepts():
184 | 235 |       with gzip.open(csv_files['concepts']['concepts']['name'], 'wt', encoding='utf-8') as concepts_csv, \
185 | 236 |               gzip.open(csv_files['concepts']['ancestors']['name'], 'wt', encoding='utf-8') as ancestors_csv, \
@@ -256,57 +307,6 @@ def flatten_concepts():
256 | 307 |                   break
257 | 308 |   
258 | 309 |   
259 |     | - def flatten_venues():
260 |     | -     with gzip.open(csv_files['venues']['venues']['name'], 'wt', encoding='utf-8') as venues_csv, \
261 |     | -             gzip.open(csv_files['venues']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
262 |     | -             gzip.open(csv_files['venues']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
263 |     | - 
264 |     | -         venues_writer = csv.DictWriter(
265 |     | -             venues_csv, fieldnames=csv_files['venues']['venues']['columns'], extrasaction='ignore'
266 |     | -         )
267 |     | -         venues_writer.writeheader()
268 |     | - 
269 |     | -         ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['venues']['ids']['columns'])
270 |     | -         ids_writer.writeheader()
271 |     | - 
272 |     | -         counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['venues']['counts_by_year']['columns'])
273 |     | -         counts_by_year_writer.writeheader()
274 |     | - 
275 |     | -         seen_venue_ids = set()
276 |     | - 
277 |     | -         files_done = 0
278 |     | -         for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'venues', '*', '*.gz')):
279 |     | -             print(jsonl_file_name)
280 |     | -             with gzip.open(jsonl_file_name, 'r') as venues_jsonl:
281 |     | -                 for venue_json in venues_jsonl:
282 |     | -                     if not venue_json.strip():
283 |     | -                         continue
284 |     | - 
285 |     | -                     venue = json.loads(venue_json)
286 |     | - 
287 |     | -                     if not (venue_id := venue.get('id')) or venue_id in seen_venue_ids:
288 |     | -                         continue
289 |     | - 
290 |     | -                     seen_venue_ids.add(venue_id)
291 |     | - 
292 |     | -                     venue['issn'] = json.dumps(venue.get('issn'))
293 |     | -                     venues_writer.writerow(venue)
294 |     | - 
295 |     | -                     if venue_ids := venue.get('ids'):
296 |     | -                         venue_ids['venue_id'] = venue_id
297 |     | -                         venue_ids['issn'] = json.dumps(venue_ids.get('issn'))
298 |     | -                         ids_writer.writerow(venue_ids)
299 |     | - 
300 |     | -                     if counts_by_year := venue.get('counts_by_year'):
301 |     | -                         for count_by_year in counts_by_year:
302 |     | -                             count_by_year['venue_id'] = venue_id
303 |     | -                             counts_by_year_writer.writerow(count_by_year)
304 |     | - 
305 |     | -             files_done += 1
306 |     | -             if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
307 |     | -                 break
308 |     | - 
309 |     | - 
310 | 310 |   def flatten_institutions():
311 | 311 |       file_spec = csv_files['institutions']
312 | 312 |   
@@ -390,58 +390,57 @@ def flatten_institutions():
390 | 390 |                   break
391 | 391 |   
392 | 392 |   
393 |     | - def flatten_authors():
394 |     | -     file_spec = csv_files['authors']
395 |     | - 
396 |     | -     with gzip.open(file_spec['authors']['name'], 'wt', encoding='utf-8') as authors_csv, \
397 |     | -             gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
398 |     | -             gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
    | 393 | + def flatten_sources():
    | 394 | +     with gzip.open(csv_files['sources']['sources']['name'], 'wt', encoding='utf-8') as sources_csv, \
    | 395 | +             gzip.open(csv_files['sources']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \
    | 396 | +             gzip.open(csv_files['sources']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv:
399 | 397 |   
400 |     | -         authors_writer = csv.DictWriter(
401 |     | -             authors_csv, fieldnames=file_spec['authors']['columns'], extrasaction='ignore'
    | 398 | +         sources_writer = csv.DictWriter(
    | 399 | +             sources_csv, fieldnames=csv_files['sources']['sources']['columns'], extrasaction='ignore'
402 | 400 |           )
403 |     | -         authors_writer.writeheader()
    | 401 | +         sources_writer.writeheader()
404 | 402 |   
405 |     | -         ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns'])
    | 403 | +         ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['sources']['ids']['columns'])
406 | 404 |           ids_writer.writeheader()
407 | 405 |   
408 |     | -         counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns'])
    | 406 | +         counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['sources']['counts_by_year']['columns'])
409 | 407 |           counts_by_year_writer.writeheader()
410 | 408 |   
    | 409 | +         seen_source_ids = set()
    | 410 | + 
411 | 411 |           files_done = 0
412 |     | -         for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'authors', '*', '*.gz')):
    | 412 | +         for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'sources', '*', '*.gz')):
413 | 413 |               print(jsonl_file_name)
414 |     | -             with gzip.open(jsonl_file_name, 'r') as authors_jsonl:
415 |     | -                 for author_json in authors_jsonl:
416 |     | -                     if not author_json.strip():
    | 414 | +             with gzip.open(jsonl_file_name, 'r') as sources_jsonl:
    | 415 | +                 for source_json in sources_jsonl:
    | 416 | +                     if not source_json.strip():
417 | 417 |                           continue
418 | 418 |   
419 |     | -                     author = json.loads(author_json)
    | 419 | +                     source = json.loads(source_json)
420 | 420 |   
421 |     | -                     if not (author_id := author.get('id')):
    | 421 | +                     if not (source_id := source.get('id')) or source_id in seen_source_ids:
422 | 422 |                           continue
423 | 423 |   
424 |     | -                     # authors
425 |     | -                     author['display_name_alternatives'] = json.dumps(author.get('display_name_alternatives'), ensure_ascii=False)
426 |     | -                     author['last_known_institution'] = (author.get('last_known_institution') or {}).get('id')
427 |     | -                     authors_writer.writerow(author)
    | 424 | +                     seen_source_ids.add(source_id)
428 | 425 |   
429 |     | -                     # ids
430 |     | -                     if author_ids := author.get('ids'):
431 |     | -                         author_ids['author_id'] = author_id
432 |     | -                         ids_writer.writerow(author_ids)
    | 426 | +                     source['issn'] = json.dumps(source.get('issn'))
    | 427 | +                     sources_writer.writerow(source)
433 | 428 |   
434 |     | -                     # counts_by_year
435 |     | -                     if counts_by_year := author.get('counts_by_year'):
    | 429 | +                     if source_ids := source.get('ids'):
    | 430 | +                         source_ids['source_id'] = source_id
    | 431 | +                         source_ids['issn'] = json.dumps(source_ids.get('issn'))
    | 432 | +                         ids_writer.writerow(source_ids)
    | 433 | + 
    | 434 | +                     if counts_by_year := source.get('counts_by_year'):
436 | 435 |                           for count_by_year in counts_by_year:
437 |     | -                             count_by_year['author_id'] = author_id
    | 436 | +                             count_by_year['source_id'] = source_id
438 | 437 |                               counts_by_year_writer.writerow(count_by_year)
    | 438 | + 
439 | 439 |               files_done += 1
440 | 440 |               if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
441 | 441 |                   break
442 | 442 |   
443 | 443 |   
444 |     | - 
445 | 444 |   def flatten_works():
446 | 445 |       file_spec = csv_files['works']
447 | 446 |   
@@ -590,9 +589,9 @@ def init_dict_writer(csv_file, file_spec, **kwargs):
590 | 589 |   
591 | 590 |   
592 | 591 |   if __name__ == '__main__':
    | 592 | +     flatten_authors()
593 | 593 |       flatten_concepts()
594 |     | -     flatten_venues()
595 | 594 |       flatten_institutions()
596 |     | -     flatten_authors()
    | 595 | +     flatten_sources()
597 | 596 |       flatten_works()
598 | 597 |
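
A quick gloss on the new function pair, for anyone skimming the diff: flatten_sources() is the old flatten_venues() with the entity renamed end to end (the csv_files key, the output file names, and the venue_id → source_id foreign key), and it keeps the deduplication set (seen_source_ids) that flatten_authors() does not have. The per-record fan-out is unchanged: each JSONL record yields one row in the main CSV, one row in the ids CSV, and one row per year in the counts_by_year CSV, with list-valued fields JSON-encoded into a single cell. Below is a minimal sketch of that fan-out on one record; the record is hypothetical and its field values are illustrative only.

    import json

    # Hypothetical record, shaped like one line of a sources JSONL file.
    source = json.loads('''{
        "id": "https://openalex.org/S0000000000",
        "display_name": "Example Journal",
        "issn": ["0000-0000"],
        "ids": {"openalex": "https://openalex.org/S0000000000", "issn": ["0000-0000"]},
        "counts_by_year": [{"year": 2021, "works_count": 10, "cited_by_count": 5}]
    }''')

    source_id = source['id']

    # sources.csv.gz row: the list-valued 'issn' is JSON-encoded into one cell.
    sources_row = dict(source, issn=json.dumps(source.get('issn')))

    # sources_ids.csv.gz row: gains a 'source_id' foreign key (formerly 'venue_id').
    ids_row = dict(source['ids'], source_id=source_id,
                   issn=json.dumps(source['ids'].get('issn')))

    # sources_counts_by_year.csv.gz rows: one per year, each keyed by 'source_id'.
    year_rows = [dict(c, source_id=source_id) for c in source['counts_by_year']]

    print(sources_row['issn'], ids_row['source_id'], year_rows[0]['year'])

The extrasaction='ignore' on the main DictWriter is what makes passing the whole record to writerow() safe: keys outside the configured column list are silently dropped instead of raising ValueError. The ids and counts_by_year writers omit it, so their rows must match the configured columns.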