1+ import json
12import re
23from pathlib import Path
34from typing import Any , Dict
2324)
2425
2526
26- def process_input_value_types (input_type_rc , value_type_rc ) -> (str , str ):
27+ def process_input_value_types (
28+ input_type_rc , value_type_rc
29+ ) -> (str , str , dict ):
2730 """
28- Process input type and value type to determine the final input type and value type,
29- that can be used by ReproSchema.
31+ Process input type and value type to determine the final input type and value type.
3032
3133 Args:
3234 input_type_rc (str): Input type from redcap form
3335 value_type_rc (str): Value type from redcap form
3436
3537 Returns:
36- tuple: (input_type, value_type)
38+ tuple: (input_type, value_type, additional_notes )
3739 input_type (str): Final input type for ReproSchema
3840 value_type (str): Final value type for ReproSchema
41+ additional_notes (dict): Additional notes about custom types, or None
3942 """
43+ additional_notes = None
44+
4045 # If input type in redcap is set but not recognized, raise an error
4146 if input_type_rc not in INPUT_TYPE_MAP :
4247 raise ValueError (
@@ -47,8 +52,20 @@ def process_input_value_types(input_type_rc, value_type_rc) -> (str, str):
4752 input_type = INPUT_TYPE_MAP .get (input_type_rc )
4853
4954 if value_type_rc :
50- # Get value type using the new function
51- value_type = get_value_type (value_type_rc )
55+ try :
56+ # Try to get standard value type
57+ value_type = get_value_type (value_type_rc )
58+ except ValueError :
59+ # If it fails, it's an unknown validation type
60+ print (
61+ f"Warning: Unrecognized validation type '{ value_type_rc } ', treating as string"
62+ )
63+ value_type = "xsd:string"
64+ additional_notes = {
65+ "source" : "redcap" ,
66+ "column" : "Text Validation Type OR Show Slider Number" ,
67+ "value" : value_type_rc ,
68+ }
5269
5370 # Adjust input type based on validation
5471 if value_type == "xsd:date" and input_type_rc == "text" :
@@ -71,7 +88,7 @@ def process_input_value_types(input_type_rc, value_type_rc) -> (str, str):
7188 else : # if no validation type is set, default to string
7289 value_type = "xsd:string"
7390
74- return input_type , value_type
91+ return input_type , value_type , additional_notes
7592
7693
7794def process_response_options (row , input_type_rc , value_type ) -> Dict [str , Any ]:
@@ -83,13 +100,14 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
83100 input_type_rc (str): Input type from redcap form
84101 value_type (str): ReproSchema value type
85102 Returns:
86- dict: Response options
103+ dict: Response options and additional notes if any
87104 """
88105 input_type = INPUT_TYPE_MAP [input_type_rc ]
89106 # Default response options
90107 response_options = {"valueType" : [value_type ]}
108+ additional_notes = None
91109
92- # Handle specific input_type_rc that modify other properties (not only inputType and valueType)
110+ # Handle specific input_type_rc that modify other properties
93111 if input_type_rc == "yesno" :
94112 response_options ["choices" ] = [
95113 {"name" : {"en" : "Yes" }, "value" : 1 },
@@ -104,17 +122,34 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
104122 response_options ["multipleChoice" ] = True
105123
106124 if row .get ("choices" ) and input_type :
107- if input_type in ["radio" , "select" , "slider" , "text" ]:
125+ # We're checking input_type (ReproSchema type) here
126+ if input_type in ["radio" , "select" , "slider" , "text" , "static" ]:
108127 choices , choices_val_type_l = process_choices (
109128 row .get ("choices" ), item_name = row ["item_name" ]
110129 )
111130 if choices :
112- response_options .update (
113- {
114- "choices" : choices ,
115- "valueType" : choices_val_type_l ,
131+ # We're checking input_type_rc (REDCap type) here
132+ if input_type_rc == "descriptive" :
133+ print (
134+ f"Info: Preserving choices for descriptive field { row ['item_name' ]} "
135+ )
136+ # Store as additional notes instead of in response options
137+ # Serialize the choices to a string to comply with additionalNotesObj model
138+ additional_notes = {
139+ "source" : "redcap" ,
140+ "column" : "Choices, Calculations, OR Slider Labels (Descriptive Field)" ,
141+ "value" : json .dumps (
142+ choices
143+ ), # Convert choices to a JSON string
116144 }
117- )
145+ else :
146+ # For normal input types, process choices normally
147+ response_options .update (
148+ {
149+ "choices" : choices ,
150+ "valueType" : choices_val_type_l ,
151+ }
152+ )
118153 if input_type == "slider" :
119154 response_options .update (
120155 {
@@ -126,33 +161,39 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
126161 pass # taken care below, it's not really choices
127162 else :
128163 print (
129- f"Warning/Error : Unexpected input type for choices in { row ['item_name' ]} : input type { input_type } "
164+ f"Warning: Unexpected input type for choices in { row ['item_name' ]} : input type { input_type } "
130165 f"(original in redcap: { input_type_rc } ), values: { row .get ('choices' )} "
131166 )
132- # raise ValueError(
133- # f"Unexpected input type '{input_type}' (original in redcap: {input_type_rc}) "
134- # f"for item with choices in {item['item_name']}"
135- # )
136167
137168 for key in RESPONSE_COND :
138169 if row .get (key ) is not None and str (row .get (key )).strip ():
170+ # Min/max validations only apply to numeric types
171+ if value_type not in ["xsd:integer" , "xsd:decimal" ]:
172+ print (
173+ f"Warning: { key } is not supported for non-numeric type { value_type } . Skipping."
174+ )
175+ continue
176+
139177 try :
140- if value_type == "xsd:integer" :
141- parsed_value = int (row [key ])
142- elif value_type == "xsd:decimal" :
143- parsed_value = float (row [key ])
178+ # Parse as float first to handle any numeric format
179+ raw_value = float (row [key ])
180+
181+ # If it's a whole number, store as integer for cleaner JSON
182+ # Otherwise, keep as float
183+ if raw_value .is_integer ():
184+ parsed_value = int (raw_value )
144185 else :
145- print (
146- f"Warning: { key } is not supported for value types other than integer or decimal, value type provided is { value_type } "
147- )
148- continue
186+ parsed_value = raw_value
187+
149188 response_options [key ] = parsed_value
189+
150190 except ValueError :
151191 print (
152- f"Warning/Error : Value { row [key ]} is not a valid { value_type } "
192+ f"Warning: Value ' { row [key ]} ' for { key } is not a valid number "
153193 )
154194 continue
155- return response_options
195+
196+ return response_options , additional_notes
156197
157198
158199def process_choices (choices_str , item_name ):
@@ -314,7 +355,7 @@ def process_row(
314355 if not input_type_rc :
315356 input_type_rc = "text"
316357
317- input_type , value_type = process_input_value_types (
358+ input_type , value_type , input_value_notes = process_input_value_types (
318359 input_type_rc , value_type_rc
319360 )
320361 item_data = {
@@ -325,9 +366,10 @@ def process_row(
325366 "ui" : {"inputType" : input_type },
326367 }
327368
328- item_data [ "responseOptions" ] = process_response_options (
369+ response_options , choices_notes = process_response_options (
329370 row , input_type_rc , value_type
330371 )
372+ item_data ["responseOptions" ] = response_options
331373
332374 # setting readonly to true based on annotation and field type
333375 if row .get ("annotation" ):
@@ -412,20 +454,60 @@ def process_row(
412454 elif row .get ("visibility" ):
413455 addProperties ["isVis" ] = normalize_condition (row .get ("visibility" ))
414456
457+ # Add custom validation type note and choices notes if present
458+ if input_value_notes :
459+ item_data .setdefault ("additionalNotesObj" , []).append (
460+ input_value_notes
461+ )
462+ if choices_notes :
463+ item_data .setdefault ("additionalNotesObj" , []).append (choices_notes )
464+
415465 return item_data , preamble_info_propagate , compute , addProperties
416466
417467
418- def process_csv (csv_file ) -> (Dict [str , Any ], list ):
468+ def process_csv (csv_file , encoding = None ) -> (Dict [str , Any ], list ):
469+ """
470+ Process a REDCap CSV file and extract structured data for items and activities.
471+
472+ Args:
473+ csv_file: Path to the REDCap CSV file
474+ encoding (str, optional): Specific encoding to use for the CSV file
475+
476+ Returns:
477+ tuple: (activities, protocol_activities_order)
478+ activities: Dictionary containing activity data
479+ protocol_activities_order: List of activity names in order
480+ """
481+ if encoding :
482+ try :
483+ df = pd .read_csv (csv_file , encoding = encoding , low_memory = False )
484+ print (f"Using specified encoding: { encoding } " )
485+ except UnicodeDecodeError :
486+ raise ValueError (
487+ f"Failed to read CSV with specified encoding: { encoding } "
488+ )
489+ else :
490+ # Try multiple encodings in order
491+ encodings = ["utf-8-sig" , "latin-1" , "windows-1252" , "cp1252" ]
492+ df = None
419493
420- df = pd .read_csv (
421- csv_file , encoding = "utf-8-sig" , low_memory = False
422- ) # utf-8-sig handles BOM automatically
494+ for encoding in encodings :
495+ try :
496+ df = pd .read_csv (csv_file , encoding = encoding , low_memory = False )
497+ print (f"Successfully read CSV with { encoding } encoding" )
498+ break
499+ except UnicodeDecodeError :
500+ print (
501+ f"Failed to decode with { encoding } , trying next encoding..."
502+ )
423503
424- df .columns = df .columns .map (
425- lambda x : x .strip ().strip ('"' )
426- ) # some cleaning might not be needed
504+ if df is None :
505+ raise ValueError (
506+ "Failed to read CSV file with any of the attempted encodings. "
507+ "Please check the file encoding or convert it to UTF-8."
508+ )
427509
428- # Clean NaNs values in the dataframe
510+ df . columns = df . columns . map ( lambda x : x . strip (). strip ( '"' ))
429511 df = df .astype (str ).replace ("nan" , "" )
430512
431513 # Validate required columns
@@ -496,7 +578,7 @@ def process_csv(csv_file) -> (Dict[str, Any], list):
496578
497579
498580def redcap2reproschema (
499- csv_file , yaml_file , output_path , schema_context_url = None
581+ csv_file , yaml_file , output_path , schema_context_url = None , encoding = None
500582):
501583 """
502584 Convert a REDCap data dictionary to Reproschema format.
@@ -506,6 +588,7 @@ def redcap2reproschema(
506588 yaml_file (str/Path): Path to the YAML configuration file
507589 output_path (str/Path): Path to the output directory
508590 schema_context_url (str, optional): URL for the schema context
591+ encoding (str, optional): Specific encoding to use for the CSV file
509592
510593 Raises:
511594 ValueError: If required files are missing or invalid
@@ -533,8 +616,8 @@ def redcap2reproschema(
533616 if schema_context_url is None :
534617 schema_context_url = CONTEXTFILE_URL
535618
536- # Process the CSV file and getting information about the activities and items
537- activities , prot_activities_order = process_csv (csv_path )
619+ # Process the CSV file with the specified encoding
620+ activities , prot_activities_order = process_csv (csv_path , encoding )
538621
539622 for activity_name , activity_data in activities .items ():
540623 create_activity_schema (
0 commit comments