Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/tutorials/python/table.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,12 @@ Qux2,4,203001,204001,+,False
table.delete_rows(query=f"SELECT * FROM {table.id} WHERE Strand = '+'")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few lines up there is a part of this markdown that needs to be updated: Deleting specific rows - Query for the rows you want to delete and call syn.delete on the results

Could you update this please?

```

* Or deleting rows based on a dataframe, where the ROW_ID and ROW_VERSION columns specify the rows to be deleted from the table. In this example, rows 2 and 3 are deleted. See this document that describes the expected columns of the dataframe: <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/table/Row.html>. Note: The ROW_VERSION begins at 1 upon row creation and increases by one with every subsequent update.

```python
table.delete_rows(df = pd.DataFrame({"ROW_ID": [2, 3], "ROW_VERSION": [1, 1]}))
```

* Deleting the whole table will delete the table and all of its rows

```python
Expand Down
82 changes: 66 additions & 16 deletions synapseclient/models/mixins/table_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -1765,10 +1765,10 @@ def _construct_partial_rows_for_upsert(
if (isinstance(cell_value, list) and len(cell_value) > 0) or not isna(
cell_value
):
partial_change_values[
column_id
] = _convert_pandas_row_to_python_types(
cell=cell_value, column_type=column_type
partial_change_values[column_id] = (
_convert_pandas_row_to_python_types(
cell=cell_value, column_type=column_type
)
)
else:
partial_change_values[column_id] = None
Expand Down Expand Up @@ -4049,31 +4049,38 @@ class TableDeleteRowMixin:

async def delete_rows_async(
self,
query: str,
query: Optional[str] = None,
df: Optional[DATA_FRAME_TYPE] = None,
*,
job_timeout: int = 600,
synapse_client: Optional[Synapse] = None,
) -> DATA_FRAME_TYPE:
"""
Delete rows from a table given a query to select rows. The query at a
minimum must select the `ROW_ID` and `ROW_VERSION` columns. If you want to
Delete rows from a table given a query or a pandas dataframe to select rows.
The query at a minimum must select the `ROW_ID` and `ROW_VERSION` columns. If you want to
inspect the data that will be deleted ahead of time you may use the
`.query` method to get the data.

The dataframe must contain at least the `ROW_ID` and `ROW_VERSION` columns. The `ROW_ETAG` column is also required
if the entity is one of the following: `EntityView`, `Dataset`, `DatasetCollection`, or `SubmissionView`.
If both query and df are provided, the query will be used.

Arguments:
query: The query to select the rows to delete. The query at a minimum
must select the `ROW_ID` and `ROW_VERSION` columns. See this document
that describes the expected syntax of the query:
<https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/web/controller/TableExamples.html>
df: A pandas dataframe that contains the rows to delete. The dataframe must at least contain the `ROW_ID` and `ROW_VERSION` columns.
If the entity is one of the following: `EntityView`, `Dataset`, `DatasetCollection`, or `SubmissionView` then the dataframe must also contain the `ROW_ETAG` column.
See this document that describes the expected columns of the dataframe:
<https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/table/Row.html>
job_timeout: The amount of time to wait for table updates to complete
before a `SynapseTimeoutError` is thrown. The default is 600 seconds.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.

Returns:
The results of your query for the rows that were deleted from the table.
The results of your query or dataframe for the rows that were deleted from the table.

Example: Selecting a row to delete
This example shows how you may select a row to delete from a table.
Expand Down Expand Up @@ -4109,17 +4116,60 @@ async def main():

asyncio.run(main())
```

Example: Selecting rows to delete using a dataframe
This example shows how you may select a row to delete from a table based on a dataframe.

```python
import asyncio
import pandas as pd
from synapseclient import Synapse
from synapseclient.models import Table # Also works with `Dataset`

syn = Synapse()
syn.login()

# Creating a pandas dataframe that contains the rows to delete.
# In this example, we create a dataframe that specifies the first two rows of the table for deletion.
# Assuming no changes have been made to the table so the ROW_VERSION is 1.

df = pd.DataFrame({"ROW_ID": [1, 2], "ROW_VERSION": [1, 1]})
async def main():
await Table(id="syn1234").delete_rows_async(df=df)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@danlu1 I got an error when running the example, here it should be:

df = pd.DataFrame({"ROW_ID": [1, 2], "ROW_VERSION": [1, 1]})
            async def main():
                await Table(id="syn1234").delete_rows_async(df=df)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch. Yes, the param should be specified. I will update.


asyncio.run(main())
```
"""
client = Synapse.get_client(synapse_client=synapse_client)
results_from_query = await self.query_async(query=query, synapse_client=client)
client.logger.info(
f"Found {len(results_from_query)} rows to delete for given query: {query}"
)
# check if both query and df are None
if query is None and df is None:
raise ValueError("Either query or df must be provided.")
if query is not None:
rows_to_delete = await self.query_async(query=query, synapse_client=client)
client.logger.info(
f"Found {len(rows_to_delete)} rows to delete for given query: {query}"
)
elif df is not None:
rows_to_delete = df
client.logger.info(
f"Found {len(rows_to_delete)} rows to delete for given dataframe."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f"Found {len(rows_to_delete)} rows to delete for given dataframe."
f"Received {len(rows_to_delete)} rows to delete for given dataframe."

)
if (
"ROW_ID" not in rows_to_delete.columns
or "ROW_VERSION" not in rows_to_delete.columns
):
raise ValueError(
"The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns."
)
Comment on lines +4157 to +4163
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be put before the log message is printed so the validation occurs first?


if self.__class__.__name__ in CLASSES_THAT_CONTAIN_ROW_ETAG:
Copy link
Contributor Author

@danlu1 danlu1 Oct 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BryanFauble When I add test case for the below ValueError, I got the AttributeError: 'Dataset' object has no attribute 'delete_rows_async' for my test case of Dataset. As this line, I believe we originally intent to use delete_row in the Dataset, EntityView, DatasetCollection, and SubmissionView but I didn't find the function in any of the models. Have we not had the chance to add this function yet?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So these other items are special @danlu1 . You can't actually delete rows of those Entity types, the reason is because there is a separate items attribute on the entity itself that dictates what should be present on the table:

https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/table/Dataset.html

If you notice that:

class Dataset(
    DatasetSynchronousProtocol,
    AccessControllable,
    ViewBase,
    ViewStoreMixin,
    DeleteMixin,
    ColumnMixin,
    GetMixin,
    QueryMixin,
    ViewUpdateMixin,
    ViewSnapshotMixin,
):

Doesn't contain the TableDeleteRowMixin which gives this ability to the class

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for confirming!

filtered_columns = results_from_query[["ROW_ID", "ROW_VERSION", "ROW_ETAG"]]
if "ROW_ETAG" not in rows_to_delete.columns:
raise ValueError(
f"The dataframe must contain the 'ROW_ETAG' column when deleting rows from a {self.__class__.__name__}."
)
filtered_columns = rows_to_delete[["ROW_ID", "ROW_VERSION", "ROW_ETAG"]]
else:
filtered_columns = results_from_query[["ROW_ID", "ROW_VERSION"]]
filtered_columns = rows_to_delete[["ROW_ID", "ROW_VERSION"]]

filepath = f"{tempfile.mkdtemp()}/{self.id}_upload_{uuid.uuid4()}.csv"
try:
Expand All @@ -4138,7 +4188,7 @@ async def main():
entity_id=self.id, changes=[upload_request]
).send_job_and_wait_async(synapse_client=client, timeout=job_timeout)

return results_from_query
return rows_to_delete


def infer_column_type_from_data(values: DATA_FRAME_TYPE) -> List[Column]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1683,7 +1683,7 @@ def init(self, syn: Synapse, schedule_for_cleanup: Callable[..., None]) -> None:
self.syn = syn
self.schedule_for_cleanup = schedule_for_cleanup

async def test_delete_single_row(self, project_model: Project) -> None:
async def test_delete_single_row_via_query(self, project_model: Project) -> None:
# GIVEN a table in Synapse
table_name = str(uuid.uuid4())
table = Table(
Expand Down Expand Up @@ -1720,7 +1720,7 @@ async def test_delete_single_row(self, project_model: Project) -> None:
# AND only 2 rows should exist on the table
assert len(results) == 2

async def test_delete_multiple_rows(self, project_model: Project) -> None:
async def test_delete_multiple_rows_via_query(self, project_model: Project) -> None:
# GIVEN a table in Synapse
table_name = str(uuid.uuid4())
table = Table(
Expand Down Expand Up @@ -1757,7 +1757,7 @@ async def test_delete_multiple_rows(self, project_model: Project) -> None:
# AND only 1 row should exist on the table
assert len(results) == 1

async def test_delete_no_rows(self, project_model: Project) -> None:
async def test_delete_no_rows_via_query(self, project_model: Project) -> None:
# GIVEN a table in Synapse
table_name = str(uuid.uuid4())
table = Table(
Expand Down Expand Up @@ -1793,6 +1793,45 @@ async def test_delete_no_rows(self, project_model: Project) -> None:
# AND 3 rows should exist on the table
assert len(results) == 3

async def test_delete_multiple_rows_via_dataframe(
    self, project_model: Project
) -> None:
    """Delete rows identified by a dataframe and verify what remains.

    Stores three rows in a fresh single-column table, deletes the rows with
    ROW_ID 2 and 3 by passing a dataframe to `delete_rows_async`, then
    queries the table and checks that only "value1" is left.
    """
    # GIVEN a single-column table stored in Synapse
    string_column = Column(name="column_string", column_type=ColumnType.STRING)
    table = Table(
        name=str(uuid.uuid4()),
        parent_id=project_model.id,
        columns=[string_column],
    )
    table = await table.store_async(synapse_client=self.syn)
    self.schedule_for_cleanup(table.id)

    # AND three rows already stored in that table
    initial_rows = pd.DataFrame({"column_string": ["value1", "value2", "value3"]})
    await table.store_rows_async(
        values=initial_rows,
        schema_storage_strategy=None,
        synapse_client=self.syn,
    )

    # WHEN I delete the rows with ROW_ID 2 and 3 using a dataframe
    rows_to_remove = pd.DataFrame({"ROW_ID": [2, 3], "ROW_VERSION": [1, 1]})
    await table.delete_rows_async(df=rows_to_remove, synapse_client=self.syn)

    # AND I query the table
    remaining = await query_async(
        f"SELECT * FROM {table.id}", synapse_client=self.syn
    )

    # THEN the data in the column should match the single surviving value
    expected_column = pd.Series(["value1"], name="column_string")
    pd.testing.assert_series_equal(remaining["column_string"], expected_column)

    # AND only 1 row should exist on the table
    assert len(remaining) == 1


class TestColumnModifications:
@pytest.fixture(autouse=True, scope="function")
Expand Down
111 changes: 110 additions & 1 deletion tests/unit/synapseclient/mixins/unit_test_table_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,7 @@ async def test_delete_with_name_and_parent_id(self):
async def test_delete_with_no_id_or_name_and_parent_id(self):
# GIVEN a TestClass instance with no id or name and parent_id
test_instance = self.ClassForTest()
test_instance.__name__ = ""

with pytest.raises(
ValueError,
Expand Down Expand Up @@ -1791,7 +1792,7 @@ class ClassForTest(TableDeleteRowMixin, QueryMixin):
name: Optional[str] = "test_table"
columns: Dict[str, Column] = field(default_factory=dict)

async def test_delete_rows_async(self):
async def test_delete_rows_async_via_query(self):
# GIVEN a TestClass instance
test_instance = self.ClassForTest()
with (
Expand All @@ -1811,12 +1812,17 @@ async def test_delete_rows_async(self):
entity_id=test_instance.id, changes=[]
),
) as mock_send_job_and_wait_async,
patch.object(self.syn.logger, "info") as mock_logger_info,
):
# WHEN I call delete_rows_async
result = await test_instance.delete_rows_async(
query=self.fake_query, synapse_client=self.syn
)

# THEN mock_logger_info should be called
mock_logger_info.assert_called_once_with(
f"Found 2 rows to delete for given query: {self.fake_query}"
)
# THEN mock_query_async should be called
mock_query_async.assert_awaited_once_with(
query=self.fake_query, synapse_client=self.syn
Expand All @@ -1834,6 +1840,109 @@ async def test_delete_rows_async(self):
pd.DataFrame({"ROW_ID": ["A", "B"], "ROW_VERSION": [1, 2]})
)

async def test_delete_rows_async_via_dataframe(self):
    """Delete rows via a dataframe and verify logging, upload, job and result.

    The upload helper and the job submission are patched, so no network
    calls are made; only the orchestration in `delete_rows_async` is checked.
    """
    # GIVEN a TestClass instance and a dataframe naming the rows to delete
    test_instance = self.ClassForTest()
    df = pd.DataFrame({"ROW_ID": ["A", "B"], "ROW_VERSION": [1, 2]})
    with (
        patch(
            "synapseclient.models.mixins.table_components.multipart_upload_file_async",
            return_value="fake_file_handle_id",
        ) as mock_multipart_upload_file_async,
        patch(
            SEND_JOB_AND_WAIT_ASYNC_PATCH,
            return_value=TableUpdateTransaction(
                entity_id=test_instance.id, changes=[]
            ),
        ) as mock_send_job_and_wait_async,
        patch.object(self.syn.logger, "info") as mock_logger_info,
    ):
        # WHEN I call delete_rows_async
        result = await test_instance.delete_rows_async(
            df=df, synapse_client=self.syn
        )

        # THEN mock_logger_info should be called
        # (plain string: the message has no placeholders, so no f-prefix)
        mock_logger_info.assert_called_once_with(
            "Found 2 rows to delete for given dataframe."
        )
        # AND mock_multipart_upload_file_async should be called
        mock_multipart_upload_file_async.assert_awaited_once()
        # AND mock_send_job_and_wait_async should be called
        mock_send_job_and_wait_async.assert_awaited_once_with(
            synapse_client=self.syn,
            timeout=600,
        )

        # AND the result should be the expected dataframe object
        assert result.equals(
            pd.DataFrame({"ROW_ID": ["A", "B"], "ROW_VERSION": [1, 2]})
        )

@pytest.mark.parametrize(
    "df, error_msg",
    [
        (
            pd.DataFrame(columns=["ROW_ID"]),  # Missing ROW_VERSION column
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
        (
            pd.DataFrame(columns=["ROW_VERSION"]),  # Missing ROW_ID column
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
        (
            pd.DataFrame(columns=["INVALID_COL", "ROW_VERSION"]),  # Invalid column
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
        (
            pd.DataFrame(columns=["ROW_ID", "INVALID_COL"]),  # Invalid column
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
        (
            pd.DataFrame(columns=["INVALID_COL1", "INVALID_COL2"]),  # Both invalid
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
    ],
)
async def test_delete_rows_via_dataframe_missing_required_column_table_entity(
    self, df, error_msg
):
    """A dataframe lacking ROW_ID or ROW_VERSION is rejected with ValueError.

    Arguments:
        df: An empty dataframe whose columns omit at least one required column.
        error_msg: The message the raised ValueError is expected to match.
    """
    # GIVEN a TestClass instance
    test_instance = self.ClassForTest()
    # WHEN I call delete_rows_async
    # THEN a ValueError describing the missing columns is raised.
    # The call raises, so its return value is never bound.
    with pytest.raises(ValueError, match=error_msg):
        await test_instance.delete_rows_async(df=df, synapse_client=self.syn)

@pytest.mark.parametrize(
    "df, error_msg",
    [
        (
            pd.DataFrame(columns=["ROW_ID"]),  # Missing ROW_VERSION column
            "The dataframe must contain the 'ROW_ID' and 'ROW_VERSION' columns.",
        ),
        (
            pd.DataFrame(
                columns=["ROW_ID", "ROW_VERSION"]
            ),  # Missing ROW_ETAG column
            "The dataframe must contain the 'ROW_ETAG' column when deleting rows from a Dataset.",
        ),
    ],
)
async def test_delete_rows_via_dataframe_missing_required_column_dataset_entity(
    self, df, error_msg
):
    """Column validation for a class named `Dataset`, which also needs ROW_ETAG.

    Arguments:
        df: An empty dataframe whose columns omit at least one required column.
        error_msg: The message the raised ValueError is expected to match.
    """

    # GIVEN an instance of a class named `Dataset` that has the delete mixin.
    # The real `synapseclient.models.Dataset` does not inherit
    # `TableDeleteRowMixin`, so calling `delete_rows_async` on it fails with
    # AttributeError before any dataframe validation runs. The mixin selects
    # the ROW_ETAG branch from `self.__class__.__name__`, so a stand-in class
    # with the same name exercises that branch.
    # NOTE(review): assumes "Dataset" is listed in
    # CLASSES_THAT_CONTAIN_ROW_ETAG - confirm.
    class Dataset(TableDeleteRowMixin):
        id = "syn123"

    test_instance = Dataset()
    # WHEN I call delete_rows_async
    # THEN a ValueError describing the missing column(s) is raised
    with pytest.raises(ValueError, match=error_msg):
        await test_instance.delete_rows_async(df=df, synapse_client=self.syn)


class TestQueryTableCsv:
"""Test suite for the _query_table_csv function."""
Expand Down
Loading