Skip to content

Commit 03763c5

Browse files
fix errors from abcd conversion practice (#94)
* fix errors from abcd conversion practice
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 5c1f898 commit 03763c5

File tree

1 file changed

+126
-43
lines changed

1 file changed

+126
-43
lines changed

reproschema/redcap2reproschema.py

Lines changed: 126 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import json
12
import re
23
from pathlib import Path
34
from typing import Any, Dict
@@ -23,20 +24,24 @@
2324
)
2425

2526

26-
def process_input_value_types(input_type_rc, value_type_rc) -> (str, str):
27+
def process_input_value_types(
28+
input_type_rc, value_type_rc
29+
) -> (str, str, dict):
2730
"""
28-
Process input type and value type to determine the final input type and value type,
29-
that can be used by ReproSchema.
31+
Process input type and value type to determine the final input type and value type.
3032
3133
Args:
3234
input_type_rc (str): Input type from redcap form
3335
value_type_rc (str): Value type from redcap form
3436
3537
Returns:
36-
tuple: (input_type, value_type)
38+
tuple: (input_type, value_type, additional_notes)
3739
input_type (str): Final input type for ReproSchema
3840
value_type (str): Final value type for ReproSchema
41+
additional_notes (dict): Additional notes about custom types, or None
3942
"""
43+
additional_notes = None
44+
4045
# If input type in redcap is set but not recognized, raise an error
4146
if input_type_rc not in INPUT_TYPE_MAP:
4247
raise ValueError(
@@ -47,8 +52,20 @@ def process_input_value_types(input_type_rc, value_type_rc) -> (str, str):
4752
input_type = INPUT_TYPE_MAP.get(input_type_rc)
4853

4954
if value_type_rc:
50-
# Get value type using the new function
51-
value_type = get_value_type(value_type_rc)
55+
try:
56+
# Try to get standard value type
57+
value_type = get_value_type(value_type_rc)
58+
except ValueError:
59+
# If it fails, it's an unknown validation type
60+
print(
61+
f"Warning: Unrecognized validation type '{value_type_rc}', treating as string"
62+
)
63+
value_type = "xsd:string"
64+
additional_notes = {
65+
"source": "redcap",
66+
"column": "Text Validation Type OR Show Slider Number",
67+
"value": value_type_rc,
68+
}
5269

5370
# Adjust input type based on validation
5471
if value_type == "xsd:date" and input_type_rc == "text":
@@ -71,7 +88,7 @@ def process_input_value_types(input_type_rc, value_type_rc) -> (str, str):
7188
else: # if no validation type is set, default to string
7289
value_type = "xsd:string"
7390

74-
return input_type, value_type
91+
return input_type, value_type, additional_notes
7592

7693

7794
def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
@@ -83,13 +100,14 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
83100
input_type_rc (str): Input type from redcap form
84101
value_type (str): ReproSchema value type
85102
Returns:
86-
dict: Response options
103+
dict: Response options and additional notes if any
87104
"""
88105
input_type = INPUT_TYPE_MAP[input_type_rc]
89106
# Default response options
90107
response_options = {"valueType": [value_type]}
108+
additional_notes = None
91109

92-
# Handle specific input_type_rc that modify other properties (not only inputType and valueType)
110+
# Handle specific input_type_rc that modify other properties
93111
if input_type_rc == "yesno":
94112
response_options["choices"] = [
95113
{"name": {"en": "Yes"}, "value": 1},
@@ -104,17 +122,34 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
104122
response_options["multipleChoice"] = True
105123

106124
if row.get("choices") and input_type:
107-
if input_type in ["radio", "select", "slider", "text"]:
125+
# We're checking input_type (ReproSchema type) here
126+
if input_type in ["radio", "select", "slider", "text", "static"]:
108127
choices, choices_val_type_l = process_choices(
109128
row.get("choices"), item_name=row["item_name"]
110129
)
111130
if choices:
112-
response_options.update(
113-
{
114-
"choices": choices,
115-
"valueType": choices_val_type_l,
131+
# We're checking input_type_rc (REDCap type) here
132+
if input_type_rc == "descriptive":
133+
print(
134+
f"Info: Preserving choices for descriptive field {row['item_name']}"
135+
)
136+
# Store as additional notes instead of in response options
137+
# Serialize the choices to a string to comply with additionalNotesObj model
138+
additional_notes = {
139+
"source": "redcap",
140+
"column": "Choices, Calculations, OR Slider Labels (Descriptive Field)",
141+
"value": json.dumps(
142+
choices
143+
), # Convert choices to a JSON string
116144
}
117-
)
145+
else:
146+
# For normal input types, process choices normally
147+
response_options.update(
148+
{
149+
"choices": choices,
150+
"valueType": choices_val_type_l,
151+
}
152+
)
118153
if input_type == "slider":
119154
response_options.update(
120155
{
@@ -126,33 +161,39 @@ def process_response_options(row, input_type_rc, value_type) -> Dict[str, Any]:
126161
pass # taken care below, it's not really choices
127162
else:
128163
print(
129-
f"Warning/Error: Unexpected input type for choices in {row['item_name']}: input type {input_type} "
164+
f"Warning: Unexpected input type for choices in {row['item_name']}: input type {input_type} "
130165
f"(original in redcap: {input_type_rc}), values: {row.get('choices')}"
131166
)
132-
# raise ValueError(
133-
# f"Unexpected input type '{input_type}' (original in redcap: {input_type_rc}) "
134-
# f"for item with choices in {item['item_name']}"
135-
# )
136167

137168
for key in RESPONSE_COND:
138169
if row.get(key) is not None and str(row.get(key)).strip():
170+
# Min/max validations only apply to numeric types
171+
if value_type not in ["xsd:integer", "xsd:decimal"]:
172+
print(
173+
f"Warning: {key} is not supported for non-numeric type {value_type}. Skipping."
174+
)
175+
continue
176+
139177
try:
140-
if value_type == "xsd:integer":
141-
parsed_value = int(row[key])
142-
elif value_type == "xsd:decimal":
143-
parsed_value = float(row[key])
178+
# Parse as float first to handle any numeric format
179+
raw_value = float(row[key])
180+
181+
# If it's a whole number, store as integer for cleaner JSON
182+
# Otherwise, keep as float
183+
if raw_value.is_integer():
184+
parsed_value = int(raw_value)
144185
else:
145-
print(
146-
f"Warning: {key} is not supported for value types other than integer or decimal, value type provided is {value_type}"
147-
)
148-
continue
186+
parsed_value = raw_value
187+
149188
response_options[key] = parsed_value
189+
150190
except ValueError:
151191
print(
152-
f"Warning/Error: Value {row[key]} is not a valid {value_type}"
192+
f"Warning: Value '{row[key]}' for {key} is not a valid number"
153193
)
154194
continue
155-
return response_options
195+
196+
return response_options, additional_notes
156197

157198

158199
def process_choices(choices_str, item_name):
@@ -314,7 +355,7 @@ def process_row(
314355
if not input_type_rc:
315356
input_type_rc = "text"
316357

317-
input_type, value_type = process_input_value_types(
358+
input_type, value_type, input_value_notes = process_input_value_types(
318359
input_type_rc, value_type_rc
319360
)
320361
item_data = {
@@ -325,9 +366,10 @@ def process_row(
325366
"ui": {"inputType": input_type},
326367
}
327368

328-
item_data["responseOptions"] = process_response_options(
369+
response_options, choices_notes = process_response_options(
329370
row, input_type_rc, value_type
330371
)
372+
item_data["responseOptions"] = response_options
331373

332374
# setting readonly to true based on annotation and field type
333375
if row.get("annotation"):
@@ -412,20 +454,60 @@ def process_row(
412454
elif row.get("visibility"):
413455
addProperties["isVis"] = normalize_condition(row.get("visibility"))
414456

457+
# Add custom validation type note and choices notes if present
458+
if input_value_notes:
459+
item_data.setdefault("additionalNotesObj", []).append(
460+
input_value_notes
461+
)
462+
if choices_notes:
463+
item_data.setdefault("additionalNotesObj", []).append(choices_notes)
464+
415465
return item_data, preamble_info_propagate, compute, addProperties
416466

417467

418-
def process_csv(csv_file) -> (Dict[str, Any], list):
468+
def process_csv(csv_file, encoding=None) -> (Dict[str, Any], list):
469+
"""
470+
Process a REDCap CSV file and extract structured data for items and activities.
471+
472+
Args:
473+
csv_file: Path to the REDCap CSV file
474+
encoding (str, optional): Specific encoding to use for the CSV file
475+
476+
Returns:
477+
tuple: (activities, protocol_activities_order)
478+
activities: Dictionary containing activity data
479+
protocol_activities_order: List of activity names in order
480+
"""
481+
if encoding:
482+
try:
483+
df = pd.read_csv(csv_file, encoding=encoding, low_memory=False)
484+
print(f"Using specified encoding: {encoding}")
485+
except UnicodeDecodeError:
486+
raise ValueError(
487+
f"Failed to read CSV with specified encoding: {encoding}"
488+
)
489+
else:
490+
# Try multiple encodings in order
491+
encodings = ["utf-8-sig", "latin-1", "windows-1252", "cp1252"]
492+
df = None
419493

420-
df = pd.read_csv(
421-
csv_file, encoding="utf-8-sig", low_memory=False
422-
) # utf-8-sig handles BOM automatically
494+
for encoding in encodings:
495+
try:
496+
df = pd.read_csv(csv_file, encoding=encoding, low_memory=False)
497+
print(f"Successfully read CSV with {encoding} encoding")
498+
break
499+
except UnicodeDecodeError:
500+
print(
501+
f"Failed to decode with {encoding}, trying next encoding..."
502+
)
423503

424-
df.columns = df.columns.map(
425-
lambda x: x.strip().strip('"')
426-
) # some cleaning might not be needed
504+
if df is None:
505+
raise ValueError(
506+
"Failed to read CSV file with any of the attempted encodings. "
507+
"Please check the file encoding or convert it to UTF-8."
508+
)
427509

428-
# Clean NaNs values in the dataframe
510+
df.columns = df.columns.map(lambda x: x.strip().strip('"'))
429511
df = df.astype(str).replace("nan", "")
430512

431513
# Validate required columns
@@ -496,7 +578,7 @@ def process_csv(csv_file) -> (Dict[str, Any], list):
496578

497579

498580
def redcap2reproschema(
499-
csv_file, yaml_file, output_path, schema_context_url=None
581+
csv_file, yaml_file, output_path, schema_context_url=None, encoding=None
500582
):
501583
"""
502584
Convert a REDCap data dictionary to Reproschema format.
@@ -506,6 +588,7 @@ def redcap2reproschema(
506588
yaml_file (str/Path): Path to the YAML configuration file
507589
output_path (str/Path): Path to the output directory
508590
schema_context_url (str, optional): URL for the schema context
591+
encoding (str, optional): Specific encoding to use for the CSV file
509592
510593
Raises:
511594
ValueError: If required files are missing or invalid
@@ -533,8 +616,8 @@ def redcap2reproschema(
533616
if schema_context_url is None:
534617
schema_context_url = CONTEXTFILE_URL
535618

536-
# Process the CSV file and getting information about the activities and items
537-
activities, prot_activities_order = process_csv(csv_path)
619+
# Process the CSV file with the specified encoding
620+
activities, prot_activities_order = process_csv(csv_path, encoding)
538621

539622
for activity_name, activity_data in activities.items():
540623
create_activity_schema(

0 commit comments

Comments
 (0)