forked from Yasin-VU/simple-python-bibliometrics
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcore_functions.py
More file actions
2249 lines (1915 loc) · 96.2 KB
/
core_functions.py
File metadata and controls
2249 lines (1915 loc) · 96.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Core functions
#
# this file contains reusable core functions like filtering on university
# and adding year and month name info
# these are functions which are generally used in every product
# roadmap: I want to push all loose functions
# into functions grouped together in classes
from nlp_functions import remove_punctuation
from nlp_functions import get_abstract_if_any
from nlp_functions import comma_space_fix
#from static import PATH_START, PATH_START_PERSONAL
#from static import PATH_START_SERVER , PATH_START_PERSONAL_SERVER
#from static import UNPAYWALL_EMAIL
#from static import PATH_STATIC_RESPONSES
#from static import PATH_STATIC_RESPONSES_ALTMETRIC
#from static import PATH_STATIC_RESPONSES_SCOPUS_ABS
#from static import MAX_NUM_WORKERS # not used everywhere so care
import pandas as pd
import calendar
import numpy as np
import requests
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
from functools import partial
### from functools import wraps
import time
from datetime import datetime # new
from datetime import timedelta
import re
import mysql.connector
from mysql.connector import Error
from altmetric import Altmetric
import pickle
import functools
from unittest.mock import Mock
from requests.models import Response
#import sys
from nlp_functions import faculty_finder
from pybliometrics.scopus import config
from pybliometrics.scopus.exception import Scopus429Error
import static
def overloaded_abstract_retrieval(identifier, view='FULL', refresh=True, id_type='eid'):
    """
    Wrapper around pybliometrics' AbstractRetrieval that swaps API keys on a
    Scopus 429 (quota exceeded) error.

    Keys are popped from static.SCOPUS_KEYS until one works or the pool is
    exhausted. Any multi-threading is done elsewhere (and may need its own
    testing as always).

    :param identifier: the document identifier (e.g. an EID string)
    :param view: scopus view to request, defaults to 'FULL'
    :param refresh: whether to bypass the local pybliometrics cache
    :param id_type: the type of `identifier`, defaults to 'eid'
    :return: an AbstractRetrieval object, or None when retrieval failed
    """
    try:
        res = AbstractRetrieval(identifier=identifier, view=view, refresh=refresh, id_type=id_type)
        time.sleep(0.05)  # small pause to stay friendly to the API
    except Scopus429Error:
        # quota exceeded: keep swapping keys until a request succeeds
        # or we run out of keys
        still_error = True
        while still_error:
            if len(static.SCOPUS_KEYS) > 0:
                config["Authentication"]["APIKey"] = static.SCOPUS_KEYS.pop()
                try:
                    time.sleep(1)  # only when key has changed so 1s is fine
                    res = AbstractRetrieval(identifier=identifier, view=view, refresh=refresh, id_type=id_type)
                    still_error = False
                except Scopus429Error:
                    # this key is also out of quota: loop and pop the next one
                    print('error, key pop will happen at top of while top')
                except Exception:  # was a bare except; only non-429 failures land here
                    print('non429 error')
                    still_error = False
                    res = None  # give up on this identifier
            else:
                # no keys left: give up
                still_error = False
                res = None
    return res
def make_doi_list_from_csv(source_path, output_path, do_return=True):
    """
    Extract the DOI column from a source scopus frontend csv file.

    :param source_path: full path ending with .csv of a csv with a column 'DOI'
    :param output_path: full path ending with .csv where the result is saved
    :param do_return: when True, also return the non-null DOI's
    :return: the non-null DOI's as a pandas Series when do_return, else None
    """
    df = pd.read_csv(source_path)
    # compute the filtered series once instead of twice
    dois = df.DOI[df.DOI.notnull()]
    dois.to_csv(output_path, header=False)
    return dois if do_return else None
def filter_on_uni(df_in, affiliation_column, cur_uni, affiliation_dict_basic):
    """ returns the dataframe filtered on the chosen university
    in: df with column 'Scopus affiliation IDs' with list of affiliation ids in scopus style
        cur_uni: a university name appearing in the dictionary affiliation_dict_basic
        affiliation_dict_basic: a dictionary with keys unis and values affiliation ids
    out: df filtered over rows
    """
    # ! scival may change their delimiters here, so please check once in a while
    # that the '| ' split still works as intended
    uni_afids = set(affiliation_dict_basic[cur_uni])
    # a row belongs to the university when its affiliation-id set
    # intersects the university's affiliation ids
    row_mask = df_in[affiliation_column].apply(
        lambda affils: bool(set(affils.split('| ')) & uni_afids))
    return df_in[row_mask]
def add_year_and_month_old(df_in, date_col):
    """ adds two columns to a dataframe: a year and a month
    in: df_in: dataframe with special column (read below)
        date_col: name of column which has data information, formatted as [start]YYYY[any 1 char]MM[anything][end]
                  column must not have Nones or nans for example
    out: dataframe with extra columns for year and month
    """
    # vectorized string slicing instead of per-row lambdas
    df_in['year'] = df_in[date_col].str[0:4]
    df_in['month'] = df_in[date_col].str[5:7]
    month_num = df_in.month.astype('int')
    year_num = df_in.year.astype('int')
    df_in['month_since_2018'] = month_num + (year_num - 2018) * 12
    df_in['month_name'] = month_num.apply(lambda m: calendar.month_name[m])
    return df_in
def add_year_and_month(df_in, date_col):
    """ adds two columns to a dataframe: a year and a month
    in: df_in: dataframe with special column (read below)
        date_col: name of column which has data information, formatted as [start]YYYY[any 1 char]MM[anything][end]
                  missing dates (None or NaN) yield None in all derived columns
    out: dataframe with extra columns for year and month
    """
    # generalized: pd.isnull catches both None and float NaN
    # (the old `x is None` check crashed on NaN cells)
    df_in['year'] = df_in[date_col].apply(lambda x: None if pd.isnull(x) else x[0:4])
    df_in['month'] = df_in[date_col].apply(lambda x: None if pd.isnull(x) else x[5:7])
    df_in['month_since_2018'] = df_in.apply(
        lambda row: None if pd.isnull(row.month) else int(row.month) + (int(row.year) - 2018) * 12,
        axis=1)
    df_in['month_name'] = df_in.month.apply(
        lambda m: None if pd.isnull(m) else calendar.month_name[int(m)])
    return df_in
def add_pure_year(df_in, date_col='Current publication status > Date'):
    """ adds one column to a dataframe: a 'pure_year' based on pure info.
    The input must fit the PURE form as 'Anything+YY'
    We assume the year is after 2000! there are no checks for this
    in: df_in: dataframe with special column (read below)
        date_col: name of column which has date information, formatted as [start][anything]YY[end]
                  column must not have Nones or nans for example
    out: dataframe with an extra 'pure_year' column (float), or NaN when no column given
    """
    if date_col is None:
        df_in['pure_year'] = np.nan
    else:
        # vectorized: take the last two characters, prefix '20', cast to float
        df_in['pure_year'] = ('20' + df_in[date_col].str[-2:]).astype(float)
    return df_in
def get_scopus_abstract_info(paper_eid):
    """
    Retrieve the scopus abstract object for one paper, with diagnostics.

    :param paper_eid: the scopus EID of the paper, or None
    :return: dict with keys 'abstract_object' (AbstractRetrieval or None),
             'no_author_group_warning' (bool), 'abstract_error' (bool),
             'abstract_error_message' (str)
    """
    # init
    no_author_group = True  # we want this diagnostic too
    error = False
    ab = None
    error_message = 'no error'
    if paper_eid is None:
        # paper without eid
        error_message = 'paper eid is none'
        error = True
    else:
        try:
            ab = overloaded_abstract_retrieval(identifier=paper_eid, view='FULL', refresh=True, id_type='eid')
        except Exception:
            error = True
            error_message = 'abstract api error'
        if not error:
            # check if the API errors out on the authorgroup call and log it
            try:
                ab.authorgroup
                no_author_group = False
            except Exception:
                no_author_group = True
        ##### this belongs in another function, with its own diagnostics + only run ff if this succeeds in topfn
        ####if not(no_author_group):
        ####    (bool_got_vu_author, a, b) = find_first_vu_author() # yet to make this
    # also if no error, save the result for returns
    return {'abstract_object': ab,
            'no_author_group_warning': no_author_group,
            'abstract_error': error,
            'abstract_error_message': error_message}
def split_scopus_subquery_affils(subquery_affils, number_of_splits=4,
                                 subquery_time = ''):
    """
    ! This function needs testing
    Translate subquery_affils from make_affiliation_dicts_afids() into a list
    of shorter subqueries, to avoid scopus query length limits.
    in: subquery_affils from make_affiliation_dicts_afids()
        number_of_splits: an integer between 2 and 10
        subquery_time: an optional query to paste after every subquery
    out: a list of subqueries to constrain scopussearch to a subset of affils
         (or a single combined string when the affiliation count is small);
         during stacking be sure to de-duplicate (recommended on EID)
    """
    # validate number_of_splits; fall back to 4 on anything out of range
    # (asking for index ranges beyond the list just yields empty slices,
    #  so number_of_splits > #afids is harmless — needs checking though)
    if not ((1 < number_of_splits <= 10) and (number_of_splits % 1 == 0)):
        print('invalid number_of_splits, replacing with 4')
        number_of_splits = 4
    affil_parts = subquery_affils.split('OR')
    affil_count = len(affil_parts)  # number of affiliation ids
    if affil_count <= 12:  # to avoid weird situations
        print('affil_count is small, returning single subquery')
        return subquery_affils + subquery_time
    step_size = int(np.floor(affil_count / number_of_splits) + 1)
    my_query_set = []
    for cur_step in range(number_of_splits):
        chunk = 'OR'.join(affil_parts[step_size * cur_step: step_size * (cur_step + 1)])
        if cur_step == 0:
            # first piece: close the paren opened by the caller's query prefix
            cur_subquery = chunk + ' ) '
        elif cur_step == number_of_splits - 1:
            # last piece: open paren only, the caller closes it
            cur_subquery = ' ( ' + chunk
        else:
            cur_subquery = ' ( ' + chunk + ' ) '
        my_query_set.append(cur_subquery + subquery_time)
    return my_query_set
def get_first_chosen_affiliation_author(ab, chosen_affid):
    """
    Find the leading author affiliated with one of the chosen affiliation ids.

    Iterates the authorgroup back to front so that the earliest matching
    author in the original order 'wins' (its values overwrite later matches).
    Note: this is not ideal, because we would also want to check the second
    chosen-affil author when the first cannot be traced back to a faculty.

    :param ab: a scopus AbstractRetrieval object (must expose .authorgroup)
    :param chosen_affid: iterable of affiliation-id strings to match on
    :return: dict with 'first_affil_author' (str or None),
             'first_affil_author_org' (str or None),
             'first_affil_author_has_error' (bool)
    """
    # init
    first_vu_author = None
    cur_org = None
    has_error = False
    chosen_set = set(chosen_affid)
    try:
        # reversed: the final overwrite comes from the leading chosen-affil author
        for author in ab.authorgroup[::-1]:
            if author.affiliation_id is None:
                # cannot match this author (yet), skip like non-chosen authors
                continue
            if set(author.affiliation_id.split(', ')).isdisjoint(chosen_set):
                continue
            cur_org = author.organization
            # missing name parts become '?' so the concatenation never fails
            given_name = author.given_name if author.given_name is not None else '?'
            surname = author.surname if author.surname is not None else '?'
            first_vu_author = given_name + ' ' + surname
    except Exception:
        has_error = True
    return {'first_affil_author': first_vu_author,
            'first_affil_author_org': cur_org,
            'first_affil_author_has_error': has_error}
def get_count_of_chosen_affiliation_authors(ab, chosen_affid):
    """
    Count the authors affiliated with one of the chosen affiliation ids.

    Note: there is no safety net if an author appears multiple times for
    some reason.

    :param ab: a scopus AbstractRetrieval object (must expose .authorgroup)
    :param chosen_affid: iterable of affiliation-id strings to match on
    :return: dict with 'affil_author_count' (int),
             'affil_author_count_valid' (bool),
             'affil_author_count_has_error' (bool)
    """
    # init
    author_count_valid = False
    author_count = 0
    has_error = False
    chosen_set = set(chosen_affid)
    try:
        # reversed to mirror get_first_chosen_affiliation_author; the count
        # itself is order-independent
        for author in ab.authorgroup[::-1]:
            if author.affiliation_id is None:
                # cannot match this author (yet), skip like non-chosen authors
                continue
            if not set(author.affiliation_id.split(', ')).isdisjoint(chosen_set):
                # a chosen-affil author: count and continue
                author_count += 1
                author_count_valid = True
    except Exception:
        has_error = True
        # author_count_valid stays False unless some authors were already counted
    return {'affil_author_count': author_count,
            'affil_author_count_valid': author_count_valid,
            'affil_author_count_has_error': has_error}
# upw start
## 1st at bottom
## 2nd
# remember, these are not for general purpose, but specific decorators for api-harvester-type functions crystal_()
def check_id_validity(func):
    """
    Decorator for crystal_() api-harvester functions: normalizes the id.

    Lower-cases the incoming id when possible; anything None / NaN / 'nan'
    or otherwise unusable is replaced by the sentinel string 'invalid' so
    the wrapped function can skip its checks and directly produce an
    invalid-id result. Not a general-purpose decorator.
    """
    # first layer is a pass right now and that is OK
    def decorator_check_id_validity(func):
        @functools.wraps(func)
        def wrapper_check_id_validity(cur_id, my_requests):
            # pre-process: try to lower-case the id, flag unusable ones
            looks_valid = False
            if (cur_id is not None) and pd.notnull(cur_id) and (cur_id != 'nan'):
                try:
                    cur_id = cur_id.lower()
                    looks_valid = True
                except Exception:
                    try:
                        # not a string: stringify first, stay on the safe side
                        # and lose a tiny bit of performance
                        cur_id = str(cur_id).lower()
                        looks_valid = True
                    except Exception:
                        # then give up
                        print('warning: failed to str(cur_doi).lower()')
            if not looks_valid:
                # sentinel: lets the crystal function skip checks and
                # directly insert the invalid-id result (the only change)
                cur_id = 'invalid'
            # run the core function; no post-processing
            return func(cur_id, my_requests)
        return wrapper_check_id_validity
    return decorator_check_id_validity(func)
#############################################add_deal_info
## 3rd
def check_errors_and_parse_outputs(func):
    """
    Decorator for crystal_() api-harvester functions: parses the raw response
    tuple of the wrapped function into a flat pandas Series (one prepended
    column per relevant key, plus error flag/message columns and the original
    and lowercased id) for easy row-wise stacking. Not general-purpose.
    """
    # first layer is a pass right now and that is OK
    def decorator_check_errors_and_parse_outputs(func):
        @functools.wraps(func)
        def wrapper_check_errors_and_parse_outputs(cur_id, my_requests=requests): # !!!!
            #
            # pre-processing
            #
            #
            # core call: r is a response(-like) object, relevant_keys the
            # fields to extract, prepend the column-name prefix, id_type e.g. 'doi'
            r, relevant_keys, cur_id_lower, prepend, id_type = func(cur_id, my_requests)
            #
            # post-processing
            #
            # init a dict and fill with right keys and zeros
            dict_init = {} # values are filled with None as starting point
            for key in relevant_keys:
                dict_init[prepend + key] = None # really init empty and stays empty if error
            dict_init[prepend + id_type] = None # can only be data['doi'] (!) # legacy
            dict_init[prepend + id_type + '_lowercase'] = cur_id_lower
            dict_init['own_' + id_type + '_lowercase'] = cur_id_lower
            dict_init['orig_' + id_type] = cur_id # legacy
            #
            dict_to_add = dict_init
            # ! somehow need to recognize doi_lowercase too...
            #
            try:
                if 'error' in r.json().keys():
                    # the following code has been checked to work as intended
                    has_error = True
                    error_message = r.json()['message']
                    dict_to_add[prepend + 'error'] = has_error
                    dict_to_add[prepend + 'error_message'] = error_message
                    #
                else:
                    # case: no error
                    #print(r)
                    #print(r.json())
                    has_error = False
                    error_message = 'no error'
                    dict_to_add[prepend + 'error'] = has_error
                    dict_to_add[prepend + 'error_message'] = error_message
                    #
                    # get data: prefer the first element of 'results',
                    # fall back to the whole json payload
                    try:
                        data = r.json()['results'][0]
                    except:
                        data = r.json()
                    # overwrite dict_to_add with data
                    for key in relevant_keys:
                        try:
                            dict_to_add[prepend + key] = data[key] # even upw_doi goes automatically : )
                        except KeyError:
                            dict_to_add[prepend + key] = None # if the key is not there, the result is None
                    dict_to_add[prepend + id_type] = cur_id # fix
            except:
                # any failure in r.json() or deeper: flag the row as errored
                has_error = True
                error_message = "error in r.json() or deeper"
                dict_to_add[prepend + 'error'] = has_error
                dict_to_add[prepend + 'error_message'] = error_message
            #
            return pd.Series(dict_to_add) # r, relevant_keys # different output # output has been changed
        return wrapper_check_errors_and_parse_outputs
    return decorator_check_errors_and_parse_outputs(func)
#############################################
## 4th
def faster(func):
    """
    Decorator for crystal_() api-harvester functions: lifts a single-id
    function to an id-list function, with optional multi-threading and a
    persistent requests session for large jobs.
    """
    # first layer is a pass right now and that is OK
    def decorator_iterate_list(func):
        @functools.wraps(func)
        def wrapper_iterate_list(doi_list, silent=True, multi_thread=True, my_requests=None, allow_session_creation=True):
            """ returns unpaywall info for a given doi list, includes result success/failure and diagnostics
            :param doi_list: doi list as a list of strings, re-computes if doi are duplicate
                             does not de-dupe or dropna for generality, but you can do
                             doi_list = df_in.doi.dropna().unique() if you so desire
            :param silent: whether you want silent behaviour or not, defaults to printing nothing
            :param multi_thread: whether you want to multi_thread (code has been tested), on by default;
                                 you do not have to worry about worker counts, a default law is integrated
            :param my_requests: by default None, but can be exchanged for a requests-session on demand;
                                with default, called functions will themselves enter 'requests'
            :param allow_session_creation: if my_requests=None, this allows the fn to make its own session
            :return: subset of api info + diagnostics as a pandas DataFrame, one row per id
            Notice: this should be the only function to call the single-id fn for more than 1 id,
            s.t. the multi-threading code can be here without duplicate code
            """
            # collect one pd.Series per id, then build the frame once at the end
            # (DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway)
            rows = []
            if multi_thread:  # valid across session used or not
                max_num_workers = static.MAX_NUM_WORKERS
                # default law: roughly one worker per 4 ids, capped, at least 1
                num_workers = np.max(
                    [1, int(np.floor(np.min([max_num_workers, np.floor(float(len(doi_list)) / 4.0)])))])
            if (my_requests is None) and (allow_session_creation is True) and (len(doi_list) >= 20):
                # optionally make our own session; skipped for small jobs to avoid overhead
                with requests.Session() as sessionA:
                    if multi_thread:
                        fn_partial = partial(func, my_requests=sessionA)  # avoid communication costs
                        rows = multithreading(fn_partial, doi_list, num_workers)
                    else:  # single thread
                        for counter, cur_doi in enumerate(doi_list):
                            if silent == False:
                                print('unpaywall busy with number ' + str(counter + 1) + ' out of ' + str(len(doi_list)))
                            rows.append(func(cur_doi, my_requests=sessionA))
            else:
                # perform without a session
                if multi_thread:
                    fn_partial = partial(func, my_requests=my_requests)  # avoid communication costs
                    rows = multithreading(fn_partial, doi_list, num_workers)
                else:  # single thread
                    for counter, cur_doi in enumerate(doi_list):
                        if silent == False:
                            print('unpaywall busy with number ' + str(counter + 1) + ' out of ' + str(len(doi_list)))
                        rows.append(func(cur_doi, my_requests=my_requests))
            # either way, stack all series as rows and return the result;
            # empty input yields an empty frame exactly like before
            if len(rows) > 0:
                df_unpaywall = pd.DataFrame(rows).reset_index(drop=True)
            else:
                df_unpaywall = pd.DataFrame()
            return df_unpaywall
        return wrapper_iterate_list
    return decorator_iterate_list(func)
## 5th
def appender(func, cur_id_name='doi'):
    """
    Returns the given dataframe with extra columns with unpaywall info and result success/failure and diagnostics
    Merging is done with lower-cased DOI's to avoid duplicate issues. The DOI name is case-insensitive
    :param df_in: df_in as a pandas dataframe, must have a column named 'doi' with doi's as string
    :return: pandas dataframe with extra columns with subset of unpaywall info and result success/failure and diagnostic
             all new doi info is lowercase
    """
    def decorator_appender(func):
        @functools.wraps(func)
        def wrapper_appender(df_in, silent=True, cut_dupes=False, avoid_double_work=True,
                             multi_thread=True, my_requests=None, allow_session_creation=True):
            if cur_id_name == 'eid':
                print('warning: scopus abstract accelerator has not been validated yet !')
            # build the id list handed to the harvester
            if avoid_double_work:
                # notice no dropna to keep functionality the same,
                # also no lower-dropna for simplicity
                id_list = df_in.drop_duplicates(cur_id_name)[cur_id_name].to_list()
            else:
                id_list = df_in[cur_id_name].to_list()
            if cut_dupes:
                print('deprecated code running')
                # I think it should yield exactly the same result, but needs testing that is all
                # overwrites
                id_list = df_in[cur_id_name].dropna().unique()
            # harvest the info for every id
            df_info = func(id_list, silent, multi_thread, my_requests, allow_session_creation)
            # merge on lowercased ids to add the new columns
            df_in.loc[:, 'id_lowercase'] = df_in[cur_id_name].str.lower()
            # drop duplicates in the harvested frame to avoid duplicate rows
            # after the merge due to repeating ids or Nones
            # assumption: all none returns are the exact same
            df_merged = df_in.merge(df_info.drop_duplicates('own_' + cur_id_name + '_lowercase'),
                                    left_on='id_lowercase',
                                    right_on='own_' + cur_id_name + '_lowercase',
                                    how='left')
            if not silent:
                print('done with add_unpaywall_columns')
            return df_merged
        return wrapper_appender
    return decorator_appender(func)
@appender
@faster
@check_errors_and_parse_outputs
@check_id_validity
def crystal_unpaywall(cur_id, my_requests):
    """
    Core unpaywall harvester: fetch the unpaywall record for one doi.

    Always use (cur_id, my_requests) for in and
    (r, relevant_keys, cur_id_lower, prepend, id_type) for out.
    cur_id is either the doi or 'invalid' when upstream validation failed.
    """
    prepend = 'upw_'
    id_type = 'doi'
    cur_id_lower = cur_id.lower()
    if my_requests is None:
        my_requests = requests  # avoids passing requests around everytime
    relevant_keys = ['free_fulltext_url',
                     'is_boai_license', 'is_free_to_read', 'is_subscription_journal',
                     'license', 'oa_color']  # , 'doi', 'doi_lowercase' : you get these from callers
    if cur_id == 'invalid':
        # read the canned invalid-doi response from disk to save time;
        # run update_api_statics to refresh it
        with open(static.PATH_STATIC_RESPONSES, 'rb') as in_file:  # context manager: no fh leak
            r = pickle.load(in_file)
    else:
        r = my_requests.get("https://api.unpaywall.org/" + str(cur_id) + "?email=" + static.UNPAYWALL_EMAIL)  # force string
        # keep multi_thread to 16 to avoid issues with local computer and in rare occasions the api returns
    return r, relevant_keys, cur_id_lower, prepend, id_type
add_unpaywall_columns = crystal_unpaywall # the final function goes through the new pipe
# recreate the legacy unpaywall functions for now
#
def legacy_crystal_unpaywall(cur_id, my_requests):
    """
    Legacy core unpaywall harvester (undecorated twin of crystal_unpaywall),
    kept only to recreate the old fn_get_upw_info entry points.

    Always use (cur_id, my_requests) for in and
    (r, relevant_keys, cur_id_lower, prepend, id_type) for out.
    cur_id is either the doi or 'invalid' when upstream validation failed.
    """
    prepend = 'upw_'
    id_type = 'doi'
    cur_id_lower = cur_id.lower()
    if my_requests is None:
        my_requests = requests  # avoids passing requests around everytime
    relevant_keys = ['free_fulltext_url',
                     'is_boai_license', 'is_free_to_read', 'is_subscription_journal',
                     'license', 'oa_color']  # , 'doi', 'doi_lowercase' : you get these from callers
    if cur_id == 'invalid':
        # read the canned invalid-doi response from disk to save time;
        # run update_api_statics to refresh it
        with open(static.PATH_STATIC_RESPONSES, 'rb') as in_file:  # context manager: no fh leak
            r = pickle.load(in_file)
    else:
        r = my_requests.get("https://api.unpaywall.org/" + str(cur_id) + "?email=" + static.UNPAYWALL_EMAIL)  # force string
        # keep multi_thread to 16 to avoid issues with local computer and in rare occasions the api returns
    return r, relevant_keys, cur_id_lower, prepend, id_type
# legacy single-id and list-level entry points, kept only for old callers
fn_get_upw_info = check_errors_and_parse_outputs(check_id_validity(legacy_crystal_unpaywall)) # avoid, legacy
fn_get_all_upw_info = faster(fn_get_upw_info) # these are only for legacy and should be avoided
###add_unpaywall_columns = appender(fn_get_all_upw_info) # the final function goes through the new pipe
#
# I do not like this kind of handling as it breaks some functools functionality
# I will refactor legacy code later some time
@appender
@faster
@check_errors_and_parse_outputs
@check_id_validity
def crystal_altmetric(cur_id, my_requests):
    """
    Core altmetric harvester: fetch the altmetric record for one doi.

    The altmetric package returns either None or a dictionary, not a request
    object, so we query the REST api directly with requests instead.
    """
    prepend = 'altmetric_'
    id_type = 'doi'
    cur_id_lower = cur_id.lower()
    if my_requests is None:
        my_requests = requests  # avoids passing requests around everytime
    # some settings
    api_ver = 'v1'  # may change in future, so here it is. For api-key re-edit with altmetric package
    api_url = "http://api.altmetric.com/%s/" % api_ver
    url = api_url + 'doi' + "/" + cur_id
    relevant_keys = ['title', 'cited_by_policies_count', 'score']  # OK for now, care some may miss, patch for that !
    # , 'doi', 'doi_lowercase' : you get these from callers
    if cur_id == 'invalid':
        # read the canned invalid-doi response from disk to save time;
        # run update_api_statics to refresh it
        with open(static.PATH_STATIC_RESPONSES_ALTMETRIC, 'rb') as in_file:  # context manager: no fh leak
            r = pickle.load(in_file)
    else:
        r = my_requests.get(url, params={}, headers={})
    return r, relevant_keys, cur_id_lower, prepend, id_type
add_altmetric_columns = crystal_altmetric
###@appender(cur_id_name='eid')
@faster
@check_errors_and_parse_outputs
@check_id_validity
def crystal_scopus_abstract(cur_id, my_requests):
    """
    Core scopus-abstract harvester: fetch the AbstractRetrieval object for one eid.

    The scopus package returns objects rather than request responses, and its
    apis are an outsourced mess, so the result is wrapped in a Mock(Response)
    whose .json() carries the pickled abstract object — this fits the shared
    decorator pipeline without homebrew request code.
    """
    prepend = 'scopus_abstract_'
    id_type = 'eid'
    cur_id_lower = cur_id.lower()  # irrelevant but OK
    # my_requests is unused here: scopus traffic goes through pybliometrics
    relevant_keys = ['obje', 'retries']  # all in one, care integration
    # , 'doi', 'doi_lowercase' : you get these from callers
    if cur_id == 'invalid':
        # read the canned invalid-id response from disk to save time;
        # run update_api_statics to refresh it
        with open(static.PATH_STATIC_RESPONSES_SCOPUS_ABS, 'rb') as in_file:
            r = pickle.load(in_file)
    else:
        # scopus api is not friendly so we need try/except plus wait-and-retry
        one_shot = False  # set True to disable the retry loop (single attempt)
        if one_shot:
            retries = 0
            try:
                ab = overloaded_abstract_retrieval(identifier=cur_id, view='FULL', refresh=True, id_type='eid')
                r = Mock(spec=Response)
                r.json.return_value = {'obje': pickle.dumps(ab), 'message': 'hi', 'retries': retries}
                r.status_code = 999
                # requirements:
                # r.json().keys
                # r.json()['message']
                # r.json()['results'] # if not present, will not unpack and use json().keys()
            except Exception:
                # fall back to the canned invalid-response routine
                with open(static.PATH_STATIC_RESPONSES_SCOPUS_ABS, 'rb') as in_file:
                    r = pickle.load(in_file)
        else:
            retry = True
            retries = -1
            while retry:
                retries = retries + 1
                try:
                    ab = overloaded_abstract_retrieval(identifier=cur_id, view='FULL', refresh=True, id_type='eid')
                    qq = ab.title
                    qqx = qq + 'x'  # raises when title is missing/None: treat as a failed call
                    # if the api does not error and we have a title, the call
                    # is correct and we got info back successfully
                    r = Mock(spec=Response)
                    r.json.return_value = {'obje': pickle.dumps(ab), 'message': 'hi', 'retries': retries}
                    r.status_code = 999
                    retry = False
                except Exception:
                    # we had an api error or a return with empty information;
                    # either way, retry up to 30 times and then fill blank
                    if retries < 30:
                        retry = True
                        time.sleep(1)
                        if retries > 2:
                            print('retrying ' + str(retries))
                        ### some valid-but-empty returns are caught here as well sadly...
                    else:
                        retry = False
                        # prepare for exit with the canned invalid response
                        with open(static.PATH_STATIC_RESPONSES_SCOPUS_ABS, 'rb') as in_file:
                            r = pickle.load(in_file)
    # you have to validate this code because scopus has weird features going on
    # which mess up data when overloading
    return r, relevant_keys, cur_id_lower, prepend, id_type
crystal_scopus_abstract = appender(func=crystal_scopus_abstract, cur_id_name='eid')
###@appender(cur_id_name='eid')
def _load_static_scopus_abs_response():
    """Load the pre-pickled 'invalid eid' Scopus abstract response from disk.

    Reading the static fallback avoids a slow API round-trip for ids known to
    be invalid; run update_api_statics to refresh the pickle file.
    """
    # `with` guarantees the file handle is closed even if pickle.load raises
    # (the original open/close pair leaked the handle on error).
    with open(static.PATH_STATIC_RESPONSES_SCOPUS_ABS, 'rb') as in_file:
        return pickle.load(in_file)


def _mock_scopus_abs_response(ab, retries):
    """Wrap an AbstractRetrieval result in a Mock mimicking a requests.Response.

    Downstream parsing (check_errors_and_parse_outputs) expects a requests-like
    object, but the scopus client returns its own type, so we fake the minimal
    interface it needs: .json() and .status_code.
    """
    try:
        ab_abstract = ab.abstract
    except Exception:
        # the .abstract property itself can raise (outside the API call);
        # fall back to NaN so downstream fillna-style handling still works
        ab_abstract = np.nan
    r = Mock(spec=Response)
    r.json.return_value = {'text': ab_abstract, 'message': 'hi', 'retries': retries}
    r.status_code = 999  # sentinel: deliberately not a real HTTP status
    return r


@faster
@check_errors_and_parse_outputs
@check_id_validity
def crystal_scopus_abstract2(cur_id, my_requests):
    """Fetch one Scopus abstract text for *cur_id* (an eid), with retries.

    Variant 2: only returns the abstract text (key 'text'), not the pickled
    AbstractRetrieval object. Because the scopus client returns its own object
    type rather than a requests.Response, the result is wrapped in a Mock that
    exposes .json() and .status_code for the parsing decorators.

    Parameters
    ----------
    cur_id : str
        Scopus eid, or the literal 'invalid' (injected by check_id_validity)
        to request the static fallback response.
    my_requests : object
        Unused here; kept for interface symmetry with the other crystal_*
        functions.

    Returns
    -------
    tuple
        (response-like object, relevant_keys, cur_id_lower, prepend, id_type)
        as consumed by check_errors_and_parse_outputs.
    """
    prepend = 'scopus_abstract_'
    id_type = 'eid'
    cur_id_lower = cur_id.lower()  # irrelevant for scopus but part of the contract
    # all-in-one key list; doi/doi_lowercase are added by callers
    relevant_keys = ['text', 'retries']
    if cur_id == 'invalid':
        r = _load_static_scopus_abs_response()
    else:
        # wait-and-retry: the scopus api is flaky, so retry up to 30 times
        # with a 1 s pause before falling back to the static invalid response.
        retries = -1
        retry = True
        while retry:
            retries = retries + 1
            try:
                ab = overloaded_abstract_retrieval(identifier=cur_id, view='FULL', refresh=True, id_type='eid')
                # force a TypeError when title is missing/None: a usable title
                # is our proxy for a successful, non-empty API return
                qq = ab.title
                qq + 'x'
                r = _mock_scopus_abs_response(ab, retries)
                retry = False
            except Exception:
                # NOTE: this was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and made the 30-retry loop
                # effectively uninterruptible; narrowed to Exception.
                # Some valid-but-empty returns are caught here as well, sadly.
                if retries < 30:
                    retry = True
                    time.sleep(1)
                    if retries > 2:
                        print('retrying ' + str(retries))
                else:
                    # give up: fall back to the static invalid-id response
                    retry = False
                    r = _load_static_scopus_abs_response()
    # you have to validate this code because scopus has weird features going in
    # which mess up data when overloading
    return r, relevant_keys, cur_id_lower, prepend, id_type
# Wrap the single-id function with appender so it can be applied across a whole
# set of eids (explicit call; presumably appender maps the function over a
# column of ids — TODO confirm against appender's definition).
crystal_scopus_abstract2 = appender(func=crystal_scopus_abstract2, cur_id_name='eid')
class api_extractor:
"""
DEPRECATED: please stop using this... I will make a new one later, for now updates and patches are stopped
This class is an api extractor: it extracts info across api's.
Has multi-threading :)
Is not an eager operator so ScopusSearch query is only executed when needed and not on initialization
source_list: which sources to use, like unpaywall
query: query to put in scopussearch
Under construction: only does unpaywall data right now to test multi-threading
Also, I need an extra step for scopussearch datacleaning split-off
Double-check that you have the right functions, e.g. for unpaywall the drop-duplicates step at the merge
Plan for now: add a ScopusSearch bypass first, then re-test the speed gain on larger volumes
"""
def __init__(self,
query='TITLE(DATA SCIENCE) AND PUBDATETXT(February 2018)',
source_list=['all'],
max_num_workers=32):
self.source_list = source_list
self.query = query
self.scopus_search_info = None
self.scopus_search_info_ready = False
self.max_num_workers = max_num_workers
def get_scopus_search_info(self, cur_query):
    """Run a ScopusSearch for *cur_query* and return its results as a DataFrame.

    Retries forever at 1 s intervals on any API failure, printing the attempt
    count and the query each time (API errors are not yet handled properly —
    there is no upper bound on retries, matching the original behavior).

    Parameters
    ----------
    cur_query : str
        Scopus search query string.

    Returns
    -------
    pandas.DataFrame
        DataFrame built from ScopusSearch(cur_query).results.
    """
    attempts = 0
    while True:
        try:
            # refresh=True bypasses the local pybliometrics cache
            return pd.DataFrame(ScopusSearch(cur_query, refresh=True).results)
        except Exception:
            # NOTE: this was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit — the unbounded retry loop could
            # not be interrupted with Ctrl-C; narrowed to Exception.
            attempts = attempts + 1
            print(str(attempts) + ' ' + cur_query)
            time.sleep(1)
def feed_scopus_search_info(self, df_in, do_return=False, do_overwrite=False):
"""
This methods allows you to directly feed in a dataframe with scopussearch info,
of the form pandas.DataFrame(ScopusSearch().results)
"""