Skip to content

Commit 4a00307

Browse files
authored
Merge branch 'main' into saumya/upgrade-odbc
2 parents 1a7b4b8 + 80ce70f commit 4a00307

18 files changed

Lines changed: 2299 additions & 1540 deletions

eng/pipelines/pr-validation-pipeline.yml

Lines changed: 223 additions & 219 deletions
Large diffs are not rendered by default.

mssql_python/connection.py

Lines changed: 88 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
InternalError,
3939
ProgrammingError,
4040
NotSupportedError,
41+
sqlstate_to_exception,
4142
)
4243
from mssql_python.auth import extract_auth_type, process_connection_string
4344
from mssql_python.constants import ConstantsDDBC, GetInfoConstants
@@ -57,6 +58,42 @@
5758
# Note: "utf-16" with BOM is NOT included as it's problematic for SQL_WCHAR
5859
UTF16_ENCODINGS: frozenset[str] = frozenset(["utf-16le", "utf-16be"])
5960

61+
_SQLSTATE_RE = re.compile(r"^SQLSTATE:([A-Z0-9]{0,5}):(.*)", re.DOTALL)
62+
63+
64+
def _raise_connection_error(e: RuntimeError) -> None:
    """Translate a RuntimeError from the C++ pybind layer into a DB-API 2.0 exception.

    Connection::checkError() formats its message as
    "SQLSTATE:XXXXX:<odbc_message>", which lets the SQLSTATE be resolved through
    sqlstate_to_exception() in the same way cursor-level errors are handled.

    This helper never returns normally; every path ends in a ``raise``. The
    original RuntimeError is deliberately dropped from the exception chain
    (``from None``).

    Args:
        e: The RuntimeError caught around a call into the native connection.

    Raises:
        OperationalError: If the message has no SQLSTATE prefix, or the prefix
            carries a code that is not exactly five characters long.
        DatabaseError: If sqlstate_to_exception() returns None for the code.
        Exception: Otherwise, whatever exception object sqlstate_to_exception()
            returned for the code.
    """
    error_msg = str(e)
    parsed = _SQLSTATE_RE.match(error_msg)

    if parsed is None:
        # Fallback: no SQLSTATE prefix — e.g. "Connection handle not allocated"
        logger.error("Connection error: %s", error_msg)
        raise OperationalError(
            driver_error="Connection operation failed",
            ddbc_error=error_msg,
        ) from None

    sqlstate, ddbc_error = parsed.groups()

    # The pattern accepts 0-5 code characters, so an empty or truncated code
    # still matches; treat anything other than a full 5-character code as malformed.
    if len(sqlstate) != 5:
        logger.error("Connection error (malformed SQLSTATE): %s", ddbc_error)
        raise OperationalError(
            driver_error="Connection operation failed",
            ddbc_error=ddbc_error,
        ) from None

    exc = sqlstate_to_exception(sqlstate, ddbc_error)
    if exc is not None:
        logger.error("Connection error (SQLSTATE %s): %s", sqlstate, ddbc_error)
        raise exc from None

    # No mapping is known for this SQLSTATE: fall back to the generic DB-API error.
    logger.error("Unknown SQLSTATE %s, raising DatabaseError", sqlstate)
    raise DatabaseError(
        driver_error=f"An error occurred with SQLSTATE code: {sqlstate}",
        ddbc_error=ddbc_error,
    ) from None
) from None
96+
6097

6198
def _validate_utf16_wchar_compatibility(
6299
encoding: str, wchar_type: int, context: str = "SQL_WCHAR"
@@ -261,10 +298,14 @@ def __init__(
261298
}
262299

263300
# Initialize decoding settings with Python 3 defaults
301+
# SQL_CHAR default uses SQL_WCHAR ctype so the ODBC driver returns
302+
# UTF-16 data for VARCHAR columns. This avoids encoding mismatches on
303+
# Windows where the driver returns raw bytes in the server's native
304+
# code page (e.g. CP-1252) that may fail to decode as UTF-8.
264305
self._decoding_settings = {
265306
ConstantsDDBC.SQL_CHAR.value: {
266-
"encoding": "utf-8",
267-
"ctype": ConstantsDDBC.SQL_CHAR.value,
307+
"encoding": "utf-16le",
308+
"ctype": ConstantsDDBC.SQL_WCHAR.value,
268309
},
269310
ConstantsDDBC.SQL_WCHAR.value: {
270311
"encoding": "utf-16le",
@@ -329,9 +370,12 @@ def __init__(
329370
if not PoolingManager.is_initialized():
330371
PoolingManager.enable()
331372
self._pooling = PoolingManager.is_enabled()
332-
self._conn = ddbc_bindings.Connection(
333-
self.connection_str, self._pooling, self._attrs_before
334-
)
373+
try:
374+
self._conn = ddbc_bindings.Connection(
375+
self.connection_str, self._pooling, self._attrs_before
376+
)
377+
except RuntimeError as e:
378+
_raise_connection_error(e)
335379
self.setautocommit(autocommit)
336380

337381
# Register this connection for cleanup before Python shutdown
@@ -452,7 +496,10 @@ def autocommit(self) -> bool:
452496
Returns:
453497
bool: True if autocommit is enabled, False otherwise.
454498
"""
455-
return self._conn.get_autocommit()
499+
try:
500+
return self._conn.get_autocommit()
501+
except RuntimeError as e:
502+
_raise_connection_error(e)
456503

457504
@autocommit.setter
458505
def autocommit(self, value: bool) -> None:
@@ -492,7 +539,10 @@ def setautocommit(self, value: bool = False) -> None:
492539
Raises:
493540
DatabaseError: If there is an error while setting the autocommit mode.
494541
"""
495-
self._conn.set_autocommit(value)
542+
try:
543+
self._conn.set_autocommit(value)
544+
except RuntimeError as e:
545+
_raise_connection_error(e)
496546

497547
def setencoding(self, encoding: Optional[str] = None, ctype: Optional[int] = None) -> None:
498548
"""
@@ -643,9 +693,13 @@ def setdecoding(
643693
sqltype (int): The SQL type being configured: SQL_CHAR, SQL_WCHAR, or SQL_WMETADATA.
644694
SQL_WMETADATA is a special flag for configuring column name decoding.
645695
encoding (str, optional): The Python encoding to use when decoding the data.
646-
If None, uses default encoding based on sqltype.
696+
If None, defaults to ``'utf-16le'`` for all sqltypes (SQL_CHAR,
697+
SQL_WCHAR, and SQL_WMETADATA), matching the connection-level
698+
defaults set in ``Connection.__init__``. Passing ``encoding=None``
699+
therefore resets the sqltype to its initial default.
647700
ctype (int, optional): The C data type to request from SQLGetData:
648-
SQL_CHAR or SQL_WCHAR. If None, uses default based on encoding.
701+
SQL_CHAR or SQL_WCHAR. If None, uses default based on encoding
702+
(SQL_WCHAR for UTF-16 variants, SQL_CHAR otherwise).
649703
650704
Returns:
651705
None
@@ -655,7 +709,10 @@ def setdecoding(
655709
InterfaceError: If the connection is closed.
656710
657711
Example:
658-
# Configure SQL_CHAR to use UTF-8 decoding
712+
# Reset SQL_CHAR to the connection default (utf-16le + SQL_WCHAR ctype)
713+
cnxn.setdecoding(mssql_python.SQL_CHAR)
714+
715+
# Configure SQL_CHAR to use UTF-8 decoding (opt-in, non-default)
659716
cnxn.setdecoding(mssql_python.SQL_CHAR, encoding='utf-8')
660717
661718
# Configure column metadata decoding
@@ -691,12 +748,15 @@ def setdecoding(
691748
),
692749
)
693750

694-
# Set default encoding based on sqltype if not provided
751+
# Set default encoding based on sqltype if not provided.
752+
# All sqltypes default to UTF-16LE to match Connection.__init__ defaults.
753+
# SQL_CHAR uses utf-16le + SQL_WCHAR ctype so the ODBC driver returns
754+
# UTF-16 data for VARCHAR columns, avoiding encoding mismatches on
755+
# Windows where the driver may otherwise return raw bytes in the
756+
# server's native code page (e.g. CP-1252). This makes
757+
# ``setdecoding(SQL_CHAR)`` with no arguments a true reset-to-defaults.
695758
if encoding is None:
696-
if sqltype == ConstantsDDBC.SQL_CHAR.value:
697-
encoding = "utf-8" # Default for SQL_CHAR in Python 3
698-
else: # SQL_WCHAR or SQL_WMETADATA
699-
encoding = "utf-16le" # Default for SQL_WCHAR in Python 3
759+
encoding = "utf-16le"
700760

701761
# Validate encoding using cached validation for better performance
702762
if not _validate_encoding(encoding):
@@ -1477,7 +1537,10 @@ def commit(self) -> None:
14771537
)
14781538

14791539
# Commit the current transaction
1480-
self._conn.commit()
1540+
try:
1541+
self._conn.commit()
1542+
except RuntimeError as e:
1543+
_raise_connection_error(e)
14811544
logger.info("Transaction committed successfully.")
14821545

14831546
def rollback(self) -> None:
@@ -1500,7 +1563,10 @@ def rollback(self) -> None:
15001563
)
15011564

15021565
# Roll back the current transaction
1503-
self._conn.rollback()
1566+
try:
1567+
self._conn.rollback()
1568+
except RuntimeError as e:
1569+
_raise_connection_error(e)
15041570
logger.info("Transaction rolled back successfully.")
15051571

15061572
def close(self) -> None:
@@ -1556,7 +1622,11 @@ def close(self) -> None:
15561622
# For autocommit True, this is not necessary as each statement is
15571623
# committed immediately
15581624
logger.debug("Rolling back uncommitted changes before closing connection.")
1559-
self._conn.rollback()
1625+
try:
1626+
self._conn.rollback()
1627+
except RuntimeError as e:
1628+
# Handle C++ layer RuntimeError with proper DB-API exception mapping
1629+
_raise_connection_error(e)
15601630
# TODO: Check potential race conditions in case of multithreaded scenarios
15611631
# Close the connection
15621632
self._conn.close()

mssql_python/cursor.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2054,19 +2054,27 @@ def _compute_column_type(self, column):
20542054
sample_value: Representative value for type inference and modified_row.
20552055
min_val: Minimum for integers (None otherwise).
20562056
max_val: Maximum for integers (None otherwise).
2057+
max_decimal_formatted_len: Maximum len(format(d, 'f')) across all
2058+
Decimal values in the column (0 when no Decimals are present).
2059+
Used by executemany to correct the SQL_VARCHAR column size when
2060+
the sample value's formatted string is shorter than another
2061+
value's (e.g. positive sample vs negative row value) (GH-557).
20572062
"""
20582063
non_nulls = [v for v in column if v is not None]
20592064
if not non_nulls:
2060-
return None, None, None
2065+
return None, None, None, 0
20612066

20622067
int_values = [v for v in non_nulls if isinstance(v, int)]
20632068
if int_values:
20642069
min_val, max_val = min(int_values), max(int_values)
20652070
sample_value = max(int_values, key=abs)
2066-
return sample_value, min_val, max_val
2071+
return sample_value, min_val, max_val, 0
20672072

20682073
sample_value = None
2074+
max_decimal_formatted_len = 0
20692075
for v in non_nulls:
2076+
if isinstance(v, decimal.Decimal):
2077+
max_decimal_formatted_len = max(max_decimal_formatted_len, len(format(v, "f")))
20702078
if not sample_value:
20712079
sample_value = v
20722080
elif isinstance(v, (str, bytes, bytearray)) and isinstance(
@@ -2120,7 +2128,7 @@ def _compute_column_type(self, column):
21202128
# If comparing Decimal to non-Decimal, prefer Decimal for better type inference
21212129
sample_value = v
21222130

2123-
return sample_value, None, None
2131+
return sample_value, None, None, max_decimal_formatted_len
21242132

21252133
def executemany( # pylint: disable=too-many-locals,too-many-branches,too-many-statements
21262134
self, operation: str, seq_of_parameters: Union[List[Sequence[Any]], List[Mapping[str, Any]]]
@@ -2225,7 +2233,7 @@ def executemany( # pylint: disable=too-many-locals,too-many-branches,too-many-s
22252233
if hasattr(seq_of_parameters, "__getitem__")
22262234
else []
22272235
)
2228-
sample_value, min_val, max_val = self._compute_column_type(column)
2236+
sample_value, min_val, max_val, _ = self._compute_column_type(column)
22292237

22302238
if self._inputsizes and col_index < len(self._inputsizes):
22312239
# Use explicitly set input sizes
@@ -2301,7 +2309,7 @@ def executemany( # pylint: disable=too-many-locals,too-many-branches,too-many-s
23012309
if hasattr(seq_of_parameters, "__getitem__")
23022310
else []
23032311
)
2304-
sample_value, min_val, max_val = self._compute_column_type(column)
2312+
sample_value, min_val, max_val, max_decimal_len = self._compute_column_type(column)
23052313

23062314
dummy_row = list(sample_row)
23072315
paraminfo = self._create_parameter_types_list(
@@ -2322,6 +2330,17 @@ def executemany( # pylint: disable=too-many-locals,too-many-branches,too-many-s
23222330
paraminfo.paramSQLType = ddbc_sql_const.SQL_VARCHAR.value
23232331
paraminfo.columnSize = 1
23242332

2333+
# Correct column size for Decimal columns sent as SQL_VARCHAR (GH-557).
2334+
# The sample value's formatted string may be shorter than another
2335+
# row's (e.g. positive sample "1.0" = 3 chars vs negative "-0.1" = 4).
2336+
# max_decimal_len was already computed during _compute_column_type
2337+
# so no extra iteration is needed.
2338+
if (
2339+
paraminfo.paramSQLType == ddbc_sql_const.SQL_VARCHAR.value
2340+
and max_decimal_len > paraminfo.columnSize
2341+
):
2342+
paraminfo.columnSize = max_decimal_len
2343+
23252344
# Special handling for binary data in auto-detected types
23262345
if paraminfo.paramSQLType in (
23272346
ddbc_sql_const.SQL_BINARY.value,
@@ -2462,8 +2481,9 @@ def fetchone(self) -> Union[None, Row]:
24622481
ret = ddbc_bindings.DDBCSQLFetchOne(
24632482
self.hstmt,
24642483
row_data,
2465-
char_decoding.get("encoding", "utf-8"),
2484+
char_decoding.get("encoding", "utf-16le"),
24662485
wchar_decoding.get("encoding", "utf-16le"),
2486+
char_decoding.get("ctype", ddbc_sql_const.SQL_WCHAR.value),
24672487
)
24682488

24692489
if self.hstmt:
@@ -2528,8 +2548,9 @@ def fetchmany(self, size: Optional[int] = None) -> List[Row]:
25282548
self.hstmt,
25292549
rows_data,
25302550
size,
2531-
char_decoding.get("encoding", "utf-8"),
2551+
char_decoding.get("encoding", "utf-16le"),
25322552
wchar_decoding.get("encoding", "utf-16le"),
2553+
char_decoding.get("ctype", ddbc_sql_const.SQL_WCHAR.value),
25332554
)
25342555

25352556
if self.hstmt:
@@ -2586,8 +2607,9 @@ def fetchall(self) -> List[Row]:
25862607
ret = ddbc_bindings.DDBCSQLFetchAll(
25872608
self.hstmt,
25882609
rows_data,
2589-
char_decoding.get("encoding", "utf-8"),
2610+
char_decoding.get("encoding", "utf-16le"),
25902611
wchar_decoding.get("encoding", "utf-16le"),
2612+
char_decoding.get("ctype", ddbc_sql_const.SQL_WCHAR.value),
25912613
)
25922614

25932615
# Check for errors

mssql_python/pybind/CMakeLists.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,25 @@ endif()
215215

216216
message(STATUS "Final Python library directory: ${PYTHON_LIB_DIR}")
217217

218+
find_package(simdutf CONFIG QUIET)
219+
220+
if(NOT simdutf_FOUND)
221+
include(FetchContent)
222+
message(STATUS "simdutf not found via find_package; downloading v8.2.0 source archive with FetchContent")
223+
set(simdutf_fetchcontent_args
224+
URL https://github.com/simdutf/simdutf/archive/refs/tags/v8.2.0.tar.gz
225+
URL_HASH SHA256=033a91b1d7d1cb818c1eff49e61faaa1b64a3a530d59ef9efef0195e56bda8b1
226+
)
227+
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
228+
list(APPEND simdutf_fetchcontent_args DOWNLOAD_EXTRACT_TIMESTAMP FALSE)
229+
endif()
230+
FetchContent_Declare(simdutf ${simdutf_fetchcontent_args})
231+
set(SIMDUTF_TESTS OFF CACHE BOOL "Disable simdutf tests" FORCE)
232+
set(SIMDUTF_TOOLS OFF CACHE BOOL "Disable simdutf tools" FORCE)
233+
set(SIMDUTF_BENCHMARKS OFF CACHE BOOL "Disable simdutf benchmarks" FORCE)
234+
FetchContent_MakeAvailable(simdutf)
235+
endif()
236+
218237
set(DDBC_SOURCE "ddbc_bindings.cpp")
219238
message(STATUS "Using standard source file: ${DDBC_SOURCE}")
220239
# Include connection module and logger bridge
@@ -293,6 +312,8 @@ else()
293312
endif()
294313
endif()
295314

315+
target_link_libraries(ddbc_bindings PRIVATE simdutf::simdutf)
316+
296317
# Compiler definitions
297318
target_compile_definitions(ddbc_bindings PRIVATE
298319
HAVE_SNPRINTF

mssql_python/pybind/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,6 @@ Examples:
184184
- Linux x86_64: `ddbc_bindings.cp311-x86_64.so`
185185
- Linux ARM64: `ddbc_bindings.cp311-arm64.so`
186186

187+
# String Handling for Unicode Data
188+
189+
Use `std::u16string` or the Python C-API when converting between SQLWCHAR data and Python strings. Use simdutf for any pure C++ UTF transcoding. Do not introduce `std::wstring` in the C++ bindings.

0 commit comments

Comments
 (0)