
Commit

adding socketio lib + fixing exceptions in pandas in the geoloc routine
JulienParis committed Jan 31, 2019
1 parent 80ccfab commit 183be25
Showing 7 changed files with 95 additions and 49 deletions.
4 changes: 4 additions & 0 deletions requirements.txt
@@ -13,8 +13,10 @@ Flask-JWT-Extended==3.15.0
Flask-Mail==0.9.1
Flask-PyMongo==2.2.0
flask-restplus==0.12.1
+Flask-SocketIO==3.2.1
geographiclib==1.49
geopy==1.18.1
+gunicorn==19.9.0
HeapDict==1.0.0
idna==2.8
itsdangerous==1.1.0
@@ -34,6 +36,8 @@ PyJWT==1.7.1
pymongo==3.7.2
python-dateutil==2.7.5
python-dotenv==0.10.1
+python-engineio==3.3.0
+python-socketio==3.1.2
pytz==2018.9
PyYAML==3.13
requests==2.21.0
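The three new pins — Flask-SocketIO plus its python-socketio / python-engineio backends — bring in the full Socket.IO stack. The commit only adds the dependencies, so the wiring below is a minimal sketch assuming a standard app-factory layout; the `create_app` name and the `message` event are illustrative, not taken from this repo.

```python
# Hypothetical Flask-SocketIO wiring — a sketch, not this repo's actual setup.
from flask import Flask
from flask_socketio import SocketIO, emit

socketio = SocketIO()  # created unbound, app-factory style

def create_app():
    app = Flask(__name__)
    socketio.init_app(app)
    return app

@socketio.on("message")
def handle_message(data):
    # echo the payload back to the emitting client
    emit("response", {"echo": data})

if __name__ == "__main__":
    # socketio.run wraps the WSGI server so websocket upgrades work
    socketio.run(create_app(), host="0.0.0.0", port=5000)
```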
19 changes: 12 additions & 7 deletions solidata_api/_core/pandas_ops/pd_concat_prj.py
@@ -21,9 +21,9 @@ def prj_dsi_mapping_as_df(prj_dsi_mapping) :
print( "- prj_dsi_mapping " + "-\- "*40)
log.debug("... run prj_dsi_mapping_as_df ...")

-df_mapper_dsi_to_dmf = pd.DataFrame(prj_dsi_mapping)
-dsi_mapped_list = list(set(df_mapper_dsi_to_dmf["oid_dsi"]))
-df_mapper_dsi_to_dmf = df_mapper_dsi_to_dmf.set_index(["oid_dsi","oid_dmf"]).sort_index()
+df_mapper_dsi_to_dmf = pd.DataFrame(prj_dsi_mapping)
+dsi_mapped_list = list(set(df_mapper_dsi_to_dmf["oid_dsi"]))
+df_mapper_dsi_to_dmf = df_mapper_dsi_to_dmf.set_index(["oid_dsi","oid_dmf"]).sort_index()
print()
log.debug("... df_mapper_dsi_to_dmf ...")
print(df_mapper_dsi_to_dmf)
@@ -133,13 +133,14 @@ def dsi_remap (dsi_data, df_mapper_dsi_to_dmf, df_mapper_col_headers ) :
df_ = pd.DataFrame(dsi_data["data_raw"]["f_data"])

### drop useless columns in df_
-df_cols = list(df_.columns)
-df_cols_to_drop = [ h for h in df_cols if h not in df_cols_to_keep ]
-df_light = df_.drop( df_cols_to_drop, axis=1 )
+df_cols = list(df_.columns)
+df_cols_to_drop = [ h for h in df_cols if h not in df_cols_to_keep ]
+df_light = df_.drop( df_cols_to_drop, axis=1 )
# print()
log.debug("... df_light (after columns drop) ...")
# print(df_light.head(3))


### convert Nan to None
df_light = df_light.replace({np.nan:None})
print()
Expand All @@ -154,8 +155,12 @@ def dsi_remap (dsi_data, df_mapper_dsi_to_dmf, df_mapper_col_headers ) :
print(df_light.dtypes)
print(df_light.head(3))

+### convert NaT to None
+df_light = df_light.replace({pd.NaT:None})
+print()
+
# ### rename columns dataframe
-remapper_dict = dict(df_map_['f_title'])
+remapper_dict = dict(df_map_['f_title'])
df_light.columns = df_light.columns.to_series().map(remapper_dict)
# print()
log.debug("... df_light (after renaming) ...")
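The new `replace({pd.NaT:None})` pass matters because the earlier `replace({np.nan:None})` only catches NaN: any datetime conversion done in between reintroduces missing values as `pd.NaT`, which then breaks JSON/BSON serialization downstream. A minimal sketch of the same two-pass normalization used in `dsi_remap`, on synthetic data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "name": ["a", None, "c"],
    "when": ["2019-01-31", "not a date", None],
})

# first pass: NaN -> None (object / float columns)
df = df.replace({np.nan: None})

# a later dtype conversion can reintroduce missing values, this time as NaT
df["when"] = pd.to_datetime(df["when"], errors="coerce")

# second pass: NaT -> None, so records serialize cleanly to JSON / MongoDB
df = df.replace({pd.NaT: None})
print(df.to_dict(orient="records"))
```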
3 changes: 2 additions & 1 deletion solidata_api/_core/queries_db/query_build_dso.py
@@ -256,7 +256,7 @@ def Query_db_build_dso (
else :
dso_f_data = []
log.debug("... dso_f_data is composed ...")
log.debug("... dso_f_data : \n%s", pformat(dso_f_data[:5]))
log.debug("... dso_f_data[:5] : \n%s", pformat(dso_f_data[:5]))

### copy f_data to dso_in
dso_in["data_raw"]["f_data"] = dso_f_data
@@ -284,6 +284,7 @@ def Query_db_build_dso (
value_test_f_data = dso_in["data_raw"]["f_data"][0]
log.debug( "value_test_f_data : \n%s", pformat(value_test_f_data) )
for k, v in value_test_f_data.items() :
+print()
log.debug("type(k) : %s", type(k))
log.debug("type(v) : %s", type(v))
log.debug("... preparing to replace / insert dso_in ...")
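Logging `type(k)` / `type(v)` on a sample record is a quick way to catch values MongoDB's BSON encoder cannot store (numpy scalars, `pd.NaT`, …) before the replace / insert fails. The same check as a small standalone sketch — the helper name is illustrative, not from this repo:

```python
from pprint import pformat

def debug_record_types(record, log=print):
    # inspect one sample record before inserting it into MongoDB;
    # numpy scalars or NaT here usually mean a pandas conversion step is missing
    log("record :\n%s" % pformat(record))
    for k, v in record.items():
        log("key %r -> %s / value -> %s" % (k, type(k).__name__, type(v).__name__))
```

Called as `debug_record_types(dso_in["data_raw"]["f_data"][0])`, it prints one line per field.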
19 changes: 11 additions & 8 deletions solidata_api/_core/queries_db/query_delete.py
@@ -33,14 +33,15 @@ def Query_db_delete (
# marshaller = Marshaller(ns, models)

### default values
-db_collection = db_dict_by_type[document_type]
-document_type_full = doc_type_dict[document_type]
-user_id = user_oid = None
-user_role = "anonymous"
-document_out = None
-response_code = 401
+db_collection = db_dict_by_type[document_type]
+document_type_full = doc_type_dict[document_type]
+user_id = user_oid = None
+user_role = "anonymous"
+doc_oid = ObjectId(doc_id)
+document_out = None
+response_code = 401
user_allowed_to_delete = False
message = "dear user, you don't have the credentials to delete this {} with this oid : {}".format(document_type_full, doc_id)
message = "dear user, you don't have the credentials to delete this {} with this oid : {}".format(document_type_full, doc_id)

if claims or claims!={} :
user_role = claims["auth"]["role"]
@@ -91,9 +92,11 @@

if user_allowed_to_delete :
### delete doc from db
db_collection.delete_one({"_id" : ObjectId(doc_id) })
db_collection.delete_one({"_id" : doc_oid })

### TO DO - delete user info from all projects and other datasets


### TO DO - OR choice to keep at least email / or / delete all data

message = "dear user, you just deleted the following %s with oid : %s" %(document_type_full, doc_id)
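Hoisting `doc_oid = ObjectId(doc_id)` into the defaults block also means a malformed id fails fast at the top of the function rather than inside the delete query. A sketch of that pattern in isolation, assuming pymongo/bson as pinned in requirements; the function name and return shape are illustrative:

```python
from bson import ObjectId
from bson.errors import InvalidId

def delete_by_id(db_collection, doc_id):
    # parse once: a malformed 24-char hex id raises InvalidId immediately
    try:
        doc_oid = ObjectId(doc_id)
    except InvalidId:
        return {"msg": "invalid oid : %s" % doc_id}, 400

    result = db_collection.delete_one({"_id": doc_oid})
    if result.deleted_count == 0:
        return {"msg": "no document found with oid : %s" % doc_id}, 404
    return {"msg": "deleted document with oid : %s" % doc_id}, 200
```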
4 changes: 2 additions & 2 deletions solidata_api/_core/queries_db/query_list.py
@@ -67,7 +67,7 @@ def Query_db_list (

### sum up all query arguments
query_resume = {
"document_type" : document_type,
"document_type" : document_type,
"user_id" : user_id,
"user_role" : user_role,
"page_args" : page_args,
@@ -77,7 +77,7 @@
### get pagination arguments
log.debug('page_args : \n%s', pformat(page_args) )
page = page_args.get('page', 1 )
-per_page = page_args.get('per_page', 10 )
+per_page = page_args.get('per_page', 10 )
if page != 1 :
start_index = ( page - 1 ) * per_page
end_index = start_index + per_page
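The pagination hunk above maps a 1-based `page` / `per_page` pair to list-slice bounds. The same arithmetic as a self-contained sketch (`page_bounds` is an illustrative name, not a function in this repo):

```python
def page_bounds(page=1, per_page=10):
    # 1-based page number -> half-open slice [start, end)
    start_index = (page - 1) * per_page
    return start_index, start_index + per_page

items = list(range(35))
start, end = page_bounds(page=2, per_page=10)
assert items[start:end] == list(range(10, 20))  # page 2 = items 10..19
```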
93 changes: 63 additions & 30 deletions solidata_api/_core/solidify/geoloc/__init__.py
@@ -79,11 +79,11 @@ def bar():
### GENERIC DEFAULT VARIABLES
### - - - - - - - - - - - - - - ###

-dft_delay = 0.5
-dft_timeout = 5
-full_address_col = "temp_solidata_full_address_"
+dft_delay = 0.5
+dft_timeout = 5
+full_address_col = "temp_solidata_full_address_"
location_compl_col = "temp_solidata_compl_location_"
location_col = "temp_solidata_location_"
location_col = "temp_solidata_location_"

empty_location = {
"src_geocoder" : None,
@@ -135,6 +135,21 @@ def info():
### GENERIC GEOLOC FUNCTIONS
### - - - - - - - - - - - - - - ###

+def stringifyValue(raw_address) :
+"""
+convert raw address value to string
+"""
+address = raw_address
+
+# print()
+# log.debug('address : %s', address)
+# log.debug('type(address) is list : %s', type(address) is list)
+
+if type(address) is list :
+address = ", ".join(raw_address)
+return address
+
+
def LocToDict(location_raw, src_geocoder=None) :
"""
location formater
@@ -184,46 +199,51 @@ def concat_cols(row, columns_to_concat):

### TO DO : prevent error 429 (too many requests) by using RateLimiter

-def geocode_with_Nominatim( adress,
-time_out = dft_timeout,
-) :
+def geocode_with_Nominatim( address,
+time_out = dft_timeout,
+) :
geocoder_nom = Nominatim(user_agent="solidata_app_to_Nominatim")
log.debug("- geocode_with_Nominatim - ")
try:
loc = geocoder_nom.geocode(
-query=adress,
+query=address,
timeout=time_out,
# exactly_one=True,
extratags=True
)
log.debug("- loc : \n%s ", loc)
return loc
except GeocoderTimedOut:
-return geocode_with_Nominatim(adress)
+return geocode_with_Nominatim(address)
except :
pass

-def geocode_with_Ban( adress,
-time_out = dft_timeout,
-) :
+def geocode_with_Ban( address,
+time_out = dft_timeout,
+) :
geocoder_ban = BANFrance(user_agent="solidata_app_to_BAN")
log.debug("- geocode_with_Ban - ")
try:
loc = geocoder_ban.geocode(
-query=adress,
+query=address,
timeout=time_out,
# exactly_one=True,
)
log.debug("- loc : \n%s ", loc)
return loc
except GeocoderTimedOut:
-return geocode_with_Ban(adress)
+return geocode_with_Ban(address)
except :
pass

### main geolocalizing function for dataframe
+### NOTE : try to slice dataframe by 100 rows
+### + update doc after each slice so to show progress to user
def geoloc_df_col(
row_val,
-time_out = dft_timeout,
-delay = dft_delay,
+complement = "",
+time_out = dft_timeout,
+delay = dft_delay,
apply_sleep = False
) :

@@ -247,18 +267,19 @@ def geoloc_df_col(
location_raw = None

### add address complement to full_address_col value (in case it helps)
-adress = row_val
+address = row_val

# if complement != "" :
# if row_val != "" :
# adress = ", ".join( [ row_val, complement ] )
# address = ", ".join( [ row_val, complement ] )
# else :
-# adress = complement
+# address = complement

log.debug("- adress : %s", adress)
log.debug("- address : %s", address)

# if pd.isna(row_val) == False :
# if pd.notnull(row_val) :
if adress != "" :
if address != "" :

print()

@@ -269,17 +290,23 @@
### run geocoders
try :
### test with nominatim first
log.debug("- try Nominatim - ")
log.debug("- try Nominatim (1) - ")
src_geocoder = "nominatim"
-location_raw = geocode_with_Nominatim(adress, time_out=time_out)
+location_raw = geocode_with_Nominatim(address, time_out=time_out)

# log.debug("- type(location_raw) : %s ", type(location_raw))

if location_raw == None :
### test just with BAN then
log.debug("- try BAN (1) - ")
src_geocoder = "BAN"
-location_raw = geocode_with_Ban(adress, time_out=time_out)
+location_raw = geocode_with_Ban(address, time_out=time_out)

+if location_raw == None and complement != "" :
+### test just with Nominatim then
+log.debug("- try Nominatim (2) - ")
+src_geocoder = "nominatim"
+location_raw = geocode_with_Nominatim(complement, time_out=time_out)

except ValueError as error_message :
log.error("ValueError : %s", ValueError)
@@ -367,7 +394,12 @@ def geoloc_dsi ( dsi_doc,
df_f_data = df_f_data.fillna(value="")
# df_f_data = df_f_data.replace({np.nan:None})


+### before concatenating columns convert lists to strings
+log.debug("... df_f_data[ cols_to_concat ].head(3) - before converting lists to strings : %s", df_f_data[ cols_to_concat ].head(3) )
+for col in cols_to_concat :
+df_f_data[ col ] = df_f_data[ col ].apply(lambda x: ', '.join(str(s) for s in x) )
+log.debug("... df_f_data[ cols_to_concat ].head(3) - after converting lists to strings : %s", df_f_data[ cols_to_concat ].head(3) )

''' apply concat function to each row (axis=1) --> alternative LESS pythonic
### change type of every target column --> string
df_f_data[cols_to_concat] = df_f_data[cols_to_concat].astype(str)
@@ -385,7 +417,7 @@
df_f_data[location_compl_col] = params["address_complement"]

### merge the columns
-df_f_data[full_address_col] = df_f_data[[full_address_col, location_compl_col]].apply(lambda x: ' '.join(x), axis=1)
+df_f_data[full_address_col] = df_f_data[[full_address_col, location_compl_col]].apply(lambda x : ' '.join(x), axis=1)
df_f_data = df_f_data.drop([location_compl_col], axis=1)

log.debug("... df_f_data.shape - before test check : %s", df_f_data.shape )
@@ -420,16 +452,17 @@
df_f_data[ location_col ] = df_f_data[ full_address_col ].swifter.apply(
geoloc_df_col,
complement = params["address_complement"],
time_out = params["timeout"],
delay = params["delay"],
time_out = params["timeout"],
delay = params["delay"],
apply_sleep = apply_sleep
)
else :
### without swifter
df_f_data[ location_col ] = df_f_data[ full_address_col ].apply(
geoloc_df_col,
time_out = params["timeout"],
delay = params["delay"],
complement = params["address_complement"],
time_out = params["timeout"],
delay = params["delay"],
apply_sleep = apply_sleep
)
# df_f_data[ location_col ] = df_f_data[ full_address_col ].apply(
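The `TO DO` note in this file about error 429 points at geopy's own throttle, `geopy.extra.rate_limiter.RateLimiter` (available in the geopy 1.18 pinned above). Below is a sketch of the same Nominatim → BAN → Nominatim-on-complement cascade that `geoloc_df_col` implements, with the rate limiter applied; `geocode_cascade` and the one-second delay are illustrative choices, not code from this commit:

```python
from geopy.geocoders import Nominatim, BANFrance
from geopy.extra.rate_limiter import RateLimiter

nominatim = Nominatim(user_agent="solidata_app_to_Nominatim")
ban = BANFrance(user_agent="solidata_app_to_BAN")

# RateLimiter enforces a delay between calls and retries on transient errors —
# geopy's built-in answer to HTTP 429 from the public Nominatim endpoint
geocode_nominatim = RateLimiter(nominatim.geocode, min_delay_seconds=1)
geocode_ban = RateLimiter(ban.geocode, min_delay_seconds=1)

def geocode_cascade(address, complement="", timeout=5):
    # same fallback order as geoloc_df_col:
    # 1) Nominatim on the full address, 2) BAN, 3) Nominatim on the complement
    attempts = [
        (address, geocode_nominatim),
        (address, geocode_ban),
        (complement, geocode_nominatim),
    ]
    for query, geocode in attempts:
        if query:
            location = geocode(query, timeout=timeout)
            if location is not None:
                return location
    return None
```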
2 changes: 1 addition & 1 deletion solidata_api/config_default_docs.py
@@ -278,7 +278,7 @@
"data_raw" : {
"f_code" : "WEBSITE",
"f_object" : "",
"f_type" : "media_link",
"f_type" : "hyperlink",
"f_comments" : "",
"f_is_required" : False,
}
