Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'

- name: Install uv
uses: astral-sh/setup-uv@v4
Expand All @@ -46,7 +46,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'

- name: Install uv
uses: astral-sh/setup-uv@v4
Expand All @@ -65,7 +65,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.10', '3.11', '3.12']
python-version: ['3.11', '3.12', '3.13']

steps:
- name: Checkout code
Expand Down Expand Up @@ -95,8 +95,8 @@ jobs:

- name: Run tests
run: |
if [ "${{ matrix.python-version }}" == "3.10" ]; then
# Run with coverage on Python 3.10
if [ "${{ matrix.python-version }}" == "3.11" ]; then
# Run with coverage on Python 3.11
uv run python -m pytest tests/ --cov=tealflow_mcp --cov-report=xml --cov-report=term-missing -v
else
# Run without coverage on other versions
Expand Down
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ build-backend = "hatchling.build"

[project]
name = "tealflow-mcp"
version = "0.2.0.dev1"
version = "0.2.0.dev2"
description = "MCP server for discovering, understanding, and generating Teal R Shiny applications for clinical trial data analysis"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.11"
license = { text = "AGPL-3.0-only" }
authors = [
{ name = "Jakub Nowicki", email = "[email protected]" },
Expand Down Expand Up @@ -35,17 +35,17 @@ classifiers = [
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU Affero General Public License v3",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Scientific/Engineering",
]
dependencies = [
"mcp>=1.0.0",
"pydantic>=2.0.0",
"pandas>=2.0.0",
"pyreadr>=0.5.0",
"rdata>=1.0.0",
]

[project.urls]
Expand Down Expand Up @@ -78,7 +78,7 @@ docs = [
[tool.ruff]
# Set the maximum line length to 100
line-length = 100
target-version = "py310"
target-version = "py311"

# Exclude common directories
exclude = [
Expand Down Expand Up @@ -135,7 +135,7 @@ skip-magic-trailing-comma = false
line-ending = "auto"

[tool.mypy]
python_version = "3.10"
python_version = "3.11"
warn_unused_configs = true
disallow_untyped_defs = false
disallow_incomplete_defs = false
Expand All @@ -152,7 +152,7 @@ strict_optional = false
module = [
"mcp.*",
"fastmcp.*",
"pyreadr.*",
"rdata.*",
"pandas.*",
]
ignore_missing_imports = true
2 changes: 1 addition & 1 deletion tealflow_mcp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
- Generating R code for Teal apps
"""

__version__ = "0.2.0.dev1"
__version__ = "0.2.0.dev2"

from .core import PackageFilter, ResponseFormat
from .models import (
Expand Down
157 changes: 140 additions & 17 deletions tealflow_mcp/utils/dataset_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,77 @@
row count, and file metadata.
"""

import datetime
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import pyreadr
import rdata
from rdata.conversion import DEFAULT_CLASS_MAP, SimpleConverter


def _date_constructor(obj: Any, attrs: Mapping[str, Any]) -> Any:
    """
    Convert an R ``Date`` vector to a pandas ``datetime64[ns]`` Series.

    R stores a Date as the number of days since 1970-01-01. The count is
    meant to be whole, but R does not enforce that; any fractional part is
    floored here so the result really has date precision (every non-missing
    value is at midnight).

    Args:
        obj: Day counts as produced by the R data parser. Anything
            ``np.asarray(..., dtype=float)`` accepts works, including a
            bare scalar.
        attrs: R attributes attached to the object. Unused; present only
            to satisfy the constructor signature.

    Returns:
        A Series with a default integer index and dtype ``datetime64[ns]``.
        Missing (NaN), infinite and out-of-range inputs become ``NaT``.
        ``Series.attrs["r_class"]`` is set to ``"Date"``.
    """
    # atleast_1d + ravel: a scalar (0-d) input would otherwise have no length.
    days = np.atleast_1d(np.asarray(obj, dtype=float)).ravel()

    # isfinite (rather than ~isnan) also keeps +/-Inf away from the
    # float -> datetime cast; those positions simply stay NaT.
    valid = np.isfinite(days)

    # Start from all-NaT and fill in only the convertible positions, so the
    # cast never sees NaN and cannot emit a RuntimeWarning.
    values = np.full(days.shape, np.datetime64("NaT"), dtype="datetime64[ns]")
    if valid.any():
        converted = pd.to_datetime(np.floor(days[valid]), unit="D", errors="coerce")
        values[valid] = converted.to_numpy(dtype="datetime64[ns]")

    result = pd.Series(values, dtype="datetime64[ns]")

    # Tag the Series with its R class so a later type-inference step can tell
    # a Date from a POSIXct whose timestamps all happen to fall on midnight.
    # NOTE(review): Series.attrs may not survive assembly into a DataFrame
    # column -- confirm the tag is still visible where the type is inferred.
    result.attrs["r_class"] = "Date"

    return result


def _posixct_constructor(obj: Any, attrs: Mapping[str, Any]) -> Any:
    """
    Convert an R ``POSIXct`` vector to a pandas ``datetime64[ns]`` Series.

    R stores POSIXct values as seconds since 1970-01-01 UTC; fractional
    seconds are preserved.

    Args:
        obj: Second counts as produced by the R data parser. Anything
            ``np.asarray(..., dtype=float)`` accepts works, including a
            bare scalar.
        attrs: R attributes attached to the object. Currently unused.

    Returns:
        A Series with a default integer index and dtype ``datetime64[ns]``
        (timezone-naive). Missing (NaN), infinite and out-of-range inputs
        become ``NaT``. ``Series.attrs["r_class"]`` is set to ``"POSIXct"``.
    """
    # NOTE(review): any time-zone information carried in ``attrs`` is ignored,
    # so the result is always naive -- confirm that is acceptable to callers.

    # atleast_1d + ravel: a scalar (0-d) input would otherwise have no length.
    seconds = np.atleast_1d(np.asarray(obj, dtype=float)).ravel()

    # isfinite (rather than ~isnan) also keeps +/-Inf away from the
    # float -> datetime cast; those positions simply stay NaT.
    valid = np.isfinite(seconds)

    # Start from all-NaT and fill in only the convertible positions, so the
    # cast never sees NaN and cannot emit a RuntimeWarning.
    values = np.full(seconds.shape, np.datetime64("NaT"), dtype="datetime64[ns]")
    if valid.any():
        converted = pd.to_datetime(seconds[valid], unit="s", errors="coerce")
        values[valid] = converted.to_numpy(dtype="datetime64[ns]")

    result = pd.Series(values, dtype="datetime64[ns]")

    # Tag the Series with its R class so a later type-inference step does not
    # mistake an all-midnight POSIXct column for a Date.
    # NOTE(review): Series.attrs may not survive assembly into a DataFrame
    # column -- confirm the tag is still visible where the type is inferred.
    result.attrs["r_class"] = "POSIXct"

    return result


# Class map handed to the rdata converter: the library defaults plus our own
# constructors for R's date/time classes.
_RDATA_CLASS_MAP = DEFAULT_CLASS_MAP.copy()
_RDATA_CLASS_MAP.update(
    {
        "Date": _date_constructor,
        "POSIXct": _posixct_constructor,
        # POSIXt is the parent class of POSIXct.
        # NOTE(review): presumably other POSIXt subclasses would also be
        # routed through the POSIXct constructor -- confirm that is intended.
        "POSIXt": _posixct_constructor,
    }
)


@dataclass
Expand Down Expand Up @@ -45,7 +110,7 @@ def _infer_object_type(col_data: pd.Series) -> str:
"""
Infer the R type for an object dtype column.

pyreadr converts numeric columns with NA values to object dtype.
R data readers may convert numeric columns with NA values to object dtype.
This function checks if non-null values can be converted to numeric or are date objects.

Args:
Expand All @@ -54,8 +119,6 @@ def _infer_object_type(col_data: pd.Series) -> str:
Returns:
"integer", "numeric", "date", or "character"
"""
import datetime

# Get non-null values
non_null = col_data.dropna()

Expand Down Expand Up @@ -91,9 +154,64 @@ def _infer_object_type(col_data: pd.Series) -> str:
return "character"


def _infer_datetime_type(col_data: pd.Series) -> str:
    """
    Infer whether a datetime column corresponds to R ``Date`` or ``POSIXct``.

    The original R class, when recorded in ``col_data.attrs["r_class"]``,
    takes precedence. This keeps a POSIXct column whose timestamps all
    legitimately fall on midnight from being reported as a Date.

    Without that tag (or with an unrecognised value) a heuristic is used:
    the column is a "date" only if *every* non-null value has a zero time
    component. All non-null values are inspected, not just a leading sample,
    so a single non-midnight timestamp anywhere in the column is enough to
    classify it as POSIXct.

    Args:
        col_data: pandas Series with datetime dtype

    Returns:
        "date" or "POSIXct"
    """
    r_class = col_data.attrs.get("r_class")
    if r_class == "Date":
        return "date"
    if r_class == "POSIXct":
        return "POSIXct"

    # Heuristic path for untagged data.
    non_null = col_data.dropna()
    if non_null.empty:
        # Nothing to inspect: default to POSIXct for empty columns.
        return "POSIXct"

    try:
        times = pd.to_datetime(non_null)
        # A value equals its own normalisation exactly when hour, minute,
        # second, microsecond and nanosecond are all zero.
        date_only = bool((times == times.dt.normalize()).all())
    except Exception:
        # Deliberate best effort: anything that cannot be examined as
        # datetimes is reported as POSIXct instead of failing the read.
        return "POSIXct"

    # This can still mislabel an untagged POSIXct column that is entirely
    # midnight; without the original class there is no way to tell.
    return "date" if date_only else "POSIXct"


def _read_rds_dataset(file_path: Path, include_sample_values: bool = False) -> DatasetInfo:
"""
Read dataset information from an RDS file using pyreadr.
Read dataset information from an RDS file using rdata.

Args:
file_path: Path to the RDS file
Expand All @@ -106,40 +224,45 @@ def _read_rds_dataset(file_path: Path, include_sample_values: bool = False) -> D
ValueError: If the file cannot be read or is not a valid RDS file
"""
try:
# Read RDS file using pyreadr
# Suppress RuntimeWarning from pyreadr's datetime conversion with NaT values
# Read RDS file using rdata with custom converters for Date/POSIXct
# Parse the file first, then convert with our custom class map
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="invalid value encountered in cast")
result = pyreadr.read_r(str(file_path))
warnings.filterwarnings("ignore", category=UserWarning)
parsed = rdata.parser.parse_file(file_path)
converter = SimpleConverter(constructor_dict=_RDATA_CLASS_MAP)
df = converter.convert(parsed)

# pyreadr returns a dict, get the first (and usually only) dataframe
if not result:
# Verify we got a DataFrame
if df is None:
raise ValueError("RDS file contains no data")

# Get the first dataframe
df = next(iter(result.values()))
if not isinstance(df, pd.DataFrame):
raise ValueError(f"RDS file does not contain a DataFrame, got {type(df).__name__}")

# Extract column information
columns = []
for col_name in df.columns:
col_data = df[col_name]

# Get pandas dtype and convert to R-like type name
dtype = str(col_data.dtype)
dtype = str(col_data.dtype).lower()

# Map pandas dtypes to R-like types
# Note: rdata uses nullable integer types (Int32, Int64) which become lowercase here
if dtype.startswith("int"):
r_type = "integer"
elif dtype.startswith("float"):
r_type = "numeric"
elif dtype == "object":
# For object dtype, try to infer if it's actually numeric
# pyreadr converts numeric columns with NAs to object dtype
# R data readers may convert numeric columns with NAs to object dtype
r_type = _infer_object_type(col_data)
elif dtype == "bool":
elif dtype in ("bool", "boolean"):
r_type = "logical"
elif dtype.startswith("datetime"):
r_type = "POSIXct"
# Distinguish between Date (date-only) and POSIXct (datetime)
# Date columns have no time component (all times are midnight)
r_type = _infer_datetime_type(col_data)
else:
r_type = dtype

Expand Down
Loading
Loading