From 80ea8f07205207517375417dd33e73587cda2104 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Wed, 2 Oct 2024 13:36:47 +0100 Subject: [PATCH 01/12] before changing the approach --- extract_thinker/llm.py | 59 +++++-- extract_thinker/models/MaskContract.py | 6 + extract_thinker/process.py | 230 ++++++++++++++++++++++++- tests/test_process.py | 125 ++++++++++++++ 4 files changed, 402 insertions(+), 18 deletions(-) create mode 100644 extract_thinker/models/MaskContract.py create mode 100644 tests/test_process.py diff --git a/extract_thinker/llm.py b/extract_thinker/llm.py index bbec2ea..c8575bc 100644 --- a/extract_thinker/llm.py +++ b/extract_thinker/llm.py @@ -1,23 +1,41 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import instructor import litellm from extract_thinker.utils import num_tokens_from_string from litellm import Router - class LLM: - def __init__(self, model: str, api_base: str = None, api_key: str = None, api_version: str = None): - self.client = instructor.from_litellm(litellm.completion, mode=instructor.Mode.MD_JSON) + def __init__( + self, + model: str, + api_base: str = None, + api_key: str = None, + api_version: str = None, + ): self.model = model self.router = None self.api_base = api_base self.api_key = api_key self.api_version = api_version + # Initialize Instructor client + self.instructor_client = instructor.from_litellm( + litellm.completion, + mode=instructor.Mode.MD_JSON, + api_base=self.api_base, + api_key=self.api_key, + api_version=self.api_version + ) + def load_router(self, router: Router) -> None: self.router = router - def request(self, messages: List[Dict[str, str]], response_model: str) -> Any: + def request( + self, + messages: List[Dict[str, str]], + response_model: Optional[str] = None + ) -> Any: + # Uncomment the following lines if you need to calculate max_tokens # contents = map(lambda message: message['content'], messages) # all_contents = ' '.join(contents) # max_tokens = num_tokens_from_string(all_contents) @@ -25,19 +43,28 @@ def request(self, messages: List[Dict[str, str]], response_model: str) -> Any: if self.router: response = self.router.completion( model=self.model, - #max_tokens=max_tokens, + # max_tokens=max_tokens, messages=messages, response_model=response_model, ) else: - response = self.client.chat.completions.create( - model=self.model, - #max_tokens=max_tokens, - messages=messages, - response_model=response_model, - api_base=self.api_base, - api_key=self.api_key, - api_version=self.api_version - ) + if response_model: + # Use Instructor client for structured responses + response = self.instructor_client.chat.completions.create( + model=self.model, + # max_tokens=max_tokens, + messages=messages, + response_model=response_model, + api_base=self.api_base, + api_key=self.api_key, + api_version=self.api_version + ) + else: + # Use LiteLLM client for unstructured responses + response = litellm.completion( + model=self.model, + # max_tokens=max_tokens, + messages=messages + ) - return response + return response \ No newline at end of file diff --git a/extract_thinker/models/MaskContract.py b/extract_thinker/models/MaskContract.py new file mode 100644 index 0000000..f8a15ad --- /dev/null +++ b/extract_thinker/models/MaskContract.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel, Field +from typing import Dict, List + +class MaskContract(BaseModel): + masked_text: str = Field(description="The masked version of the text") + mapping: Dict[str, str] = Field(description="A dictionary mapping placeholders to original values") \ No newline at end of file diff --git a/extract_thinker/process.py b/extract_thinker/process.py index 8542701..6a5ff86 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -12,10 +12,10 @@ DocGroups, ) from extract_thinker.utils import get_image_type - +from extract_thinker.llm import LLM +from extract_thinker.models.MaskContract import MaskContract from enum import Enum - class ClassificationStrategy(Enum): CONSENSUS = "consensus" HIGHER_ORDER = "higher_order" @@ -33,6 +33,232 @@ def __init__(self): self.file_path: Optional[str] = None self.file_stream: Optional[IO] = None self.splitter: Optional[Splitter] = None + self.masking_llm: Optional[LLM] = None + self.masking_enabled: bool = False + + def add_masking_llm(self, model: Optional[str] = None) -> None: + self.masking_enabled = True + + if isinstance(model, LLM): + self.masking_llm = model + elif model is not None: + self.masking_llm = LLM(model) + else: + raise ValueError("Either a model string or an LLM object must be provided.") + + async def mask_content(self, content: str) -> MaskContract: + if not self.masking_enabled or not self.masking_llm: + raise ValueError("Masking is not enabled, please set a masking llm with add_masking_llm") + + # Step 1: Get masked text and placeholder list + messages_step1 = [ + { + "role": "system", + "content": "You are an AI assistant that masks sensitive information in text." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Example 3: + Original text: + Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. + + Placeholder list: + [PERSON1]: Dr. Emily Brown + [DATE1]: 1985-03-22 + [HOSPITAL1]: Central Hospital + [PERSON2]: Mr. David Lee + [DATE2]: 2023-06-10 + [TIME1]: 2:30 PM + + Masked text: + [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. + """ + } + ] + + response_step1 = self.masking_llm.request(messages_step1) + + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). + Always use [], not "" or '' + Make sure that masked_text contains no sensitive information, only the placeholders. + + Example 1: + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." + }} + + Example 2: + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "Sarah Johnson", + "[PRODUCT1]": "laptop", + "[STORE1]": "TechStore", + "[DATE1]": "2023-05-15", + "[EMAIL1]": "sarah.j@email.com" + }}, + "masked_text": "[PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]." + }} + + Example 3: + Placeholder list: + [PERSON1]: Dr. Emily Brown + [DATE1]: 1985-03-22 + [HOSPITAL1]: Central Hospital + [PERSON2]: Mr. David Lee + [DATE2]: 2023-06-10 + [TIME1]: 2:30 PM + + Masked text: + [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "Dr. Emily Brown", + "[DATE1]": "1985-03-22", + "[HOSPITAL1]": "Central Hospital", + "[PERSON2]": "Mr. David Lee", + "[DATE2]": "2023-06-10", + "[TIME1]": "2:30 PM" + }}, + "masked_text": "[PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]." + }} + + Example 4: + Placeholder list: + [COMPANY1]: Company XYZ + [PERSON1]: Jane Doe + [COMPANY2]: ABC Corp + [DATE1]: July 1, 2023 + [AMOUNT1]: $500 million + + Masked text: + [COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]. + + Output: + {{ + "mapping": {{ + "[COMPANY1]": "Company XYZ", + "[PERSON1]": "Jane Doe", + "[COMPANY2]": "ABC Corp", + "[DATE1]": "July 1, 2023", + "[AMOUNT1]": "$500 million" + }}, + "masked_text": "[COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]." + }} + + Example 5: + Placeholder list: + [CREDITCARD1]: 4111-1111-1111-1111 + [PERSON1]: Michael Johnson + [DATE1]: 12/25 + [CVV1]: 123 + + Masked text: + The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]. + + Output: + {{ + "mapping": {{ + "[CREDITCARD1]": "4111-1111-1111-1111", + "[PERSON1]": "Michael Johnson", + "[DATE1]": "12/25", + "[CVV1]": "123" + }}, + "masked_text": "The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]." + }} + + Now, please convert the following masked text and placeholder list into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + response_step2 = self.masking_llm.request(messages_step2, MaskContract) + + return response_step2 + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content def set_document_loader_for_file_type(self, file_type: str, document_loader: DocumentLoader): if self.document_loader is not None: diff --git a/tests/test_process.py b/tests/test_process.py new file mode 100644 index 0000000..6f52a1a --- /dev/null +++ b/tests/test_process.py @@ -0,0 +1,125 @@ +import os +import pytest +from dotenv import load_dotenv + +from extract_thinker.extractor import Extractor +from extract_thinker.process import Process +from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf +from extract_thinker.llm import LLM +import asyncio + +load_dotenv() +cwd = os.getcwd() + +def test_mask(): + # Arrange + test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + + process = Process() + process.load_document_loader(DocumentLoaderPyPdf()) + process.load_file(test_file_path) + process.add_masking_llm("groq/llama-3.2-3b-preview") + + # Act + test_text = ( + "Mr. George Collins lives at 123 Main St, Anytown, USA 12345. His phone number is 555-1234. " + "Jane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210. " + "The company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. " + "For customer service, call 1-800-555-9876 or email support@example.com. " + "Sarah Lee, our HR manager, can be contacted at 444-333-2222 or sarah.lee@company.com. " + "The project budget is $250,000, with an additional $50,000 allocated for contingencies. " + "Monthly maintenance costs are estimated at $3,500. " + "For international clients, please use +1-555-987-6543. " + "Our tax ID number is 12-3456789." + ) + + result = asyncio.run(process.mask_content(test_text)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Check if all original sensitive information is masked + sensitive_info = [ + "George Collins", "123 Main St", "555-1234", + "Jane Smith", "456 Elm Avenue", "(987) 654-3210", + "Robert Johnson", "789 Corporate Blvd", + "1-800-555-9876", "support@example.com", + "Sarah Lee", "444-333-2222", "sarah.lee@company.com", + "$250,000", "$50,000", "$3,500", + "+1-555-987-6543", "12-3456789" + ] + for info in sensitive_info: + assert info not in result.masked_text, f"{info} was not masked properly" + + # Check if placeholders are present in masked text + placeholder_types = ["NAME", "ADDRESS", "PHONE", "EMAIL"] + assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" + + # Check mapping + assert len(result.mapping) >= 10, "Mapping should contain at least 10 items" + assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" + assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" + + # Test unmasking + unmasked_content = process.unmask_content(result.masked_text, result.mapping) + assert "George Collins" in unmasked_content, "Unmasking failed for 'George Collins'" + assert "123 Main St" in unmasked_content, "Unmasking failed for '123 Main St'" + assert "555-1234" in unmasked_content, "Unmasking failed for '555-1234'" + + # Check if all masked content is unmasked + for placeholder, original in result.mapping.items(): + assert original in unmasked_content, f"Unmasking failed for {original}" + assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" + +def test_mask_invoice(): + # Arrange + test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + + process = Process() + process.load_document_loader(DocumentLoaderPyPdf()) + process.load_file(test_file_path) + llm = LLM("ollama/qwen2.5:3b", "http://localhost:11434") + process.add_masking_llm(llm) + + # Act + content = process.document_loader.load_content_from_file(test_file_path) + # concat all the text from the content list + content = "".join([item for item in content["text"]]) + result = asyncio.run(process.mask_content(content)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Check if sensitive information is masked + sensitive_info = [ + "Market Financial Consulting", "450 East 78th Ave", + "Denver, CO 12345", "(123) 456-7890", "(123) 456-7891", + "Gaurav Cheema", "Caneiro Group", "89 Pacific Ave", + "San Francisco, CA 78910", "375.00", "1125.00" + ] + for info in sensitive_info: + assert info not in result.masked_text, f"{info} was not masked properly" + + # Check if placeholders are present in masked text + placeholder_types = ["COMPANY", "ADDRESS", "PHONE", "NAME", "AMOUNT"] + assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" + + # Check mapping + assert len(result.mapping) >= 8, "Mapping should contain at least 8 items" + assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" + assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" + + # Test unmasking + unmasked_content = process.unmask_content(result.masked_text, result.mapping) + for info in sensitive_info: + assert info in unmasked_content, f"Unmasking failed for '{info}'" + + # Check if all masked content is unmasked + for placeholder, original in result.mapping.items(): + assert original in unmasked_content, f"Unmasking failed for {original}" + assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" + +if __name__ == "__main__": + test_mask_invoice() \ No newline at end of file From a2861b00f1954ba3bde4745dfd5c733d9e36ec03 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Thu, 3 Oct 2024 00:03:09 +0100 Subject: [PATCH 02/12] code functional, but not working well --- .../masking/abstract_masking_strategy.py | 15 ++ .../masking/llm_masking_strategy.py | 105 ++++++++ .../masking/mocked_data_masking_strategy.py | 142 ++++++++++ .../simple_placeholder_masking_strategy.py | 236 +++++++++++++++++ extract_thinker/process.py | 250 ++---------------- tests/test_process.py | 35 +-- 6 files changed, 532 insertions(+), 251 deletions(-) create mode 100644 extract_thinker/masking/abstract_masking_strategy.py create mode 100644 extract_thinker/masking/llm_masking_strategy.py create mode 100644 extract_thinker/masking/mocked_data_masking_strategy.py create mode 100644 extract_thinker/masking/simple_placeholder_masking_strategy.py diff --git a/extract_thinker/masking/abstract_masking_strategy.py b/extract_thinker/masking/abstract_masking_strategy.py new file mode 100644 index 0000000..d96950f --- /dev/null +++ b/extract_thinker/masking/abstract_masking_strategy.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod +from extract_thinker.models.MaskContract import MaskContract +from extract_thinker.llm import LLM + +class AbstractMaskingStrategy(ABC): + def __init__(self, llm: LLM): + self.llm = llm + + @abstractmethod + async def mask_content(self, content: str) -> MaskContract: + pass + + @abstractmethod + def unmask_content(self, masked_content: str, mapping: dict) -> str: + pass \ No newline at end of file diff --git a/extract_thinker/masking/llm_masking_strategy.py b/extract_thinker/masking/llm_masking_strategy.py new file mode 100644 index 0000000..6fee20b --- /dev/null +++ b/extract_thinker/masking/llm_masking_strategy.py @@ -0,0 +1,105 @@ +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract +from extract_thinker.llm import LLM + +class LLMMaskingStrategy(AbstractMaskingStrategy): + async def mask_content(self, content: str) -> MaskContract: + # Step 1: Get masked text and placeholder list + messages_step1 = [ + { + "role": "system", + "content": "You are an AI assistant that masks sensitive information in text." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. + """ + } + ] + + response_step1 = self.llm.request(messages_step1) + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). + Always use [], not "" or '' + Make sure that masked_text contains no sensitive information, only the placeholders. + + Example 1: + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." + }} + + Now, please convert the following masked text and placeholder list into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + return self.llm.request(messages_step2, MaskContract) + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content \ No newline at end of file diff --git a/extract_thinker/masking/mocked_data_masking_strategy.py b/extract_thinker/masking/mocked_data_masking_strategy.py new file mode 100644 index 0000000..ad201b4 --- /dev/null +++ b/extract_thinker/masking/mocked_data_masking_strategy.py @@ -0,0 +1,142 @@ +import re +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract + +class MockedDataMaskingStrategy(AbstractMaskingStrategy): + def __init__(self, llm: LLM): + super().__init__(llm) + + async def mask_content(self, content: str) -> MaskContract: + # Step 1: Get masked text and mocked data mapping + messages_step1 = [ + { + "role": "system", + "content": "You are an AI assistant that masks sensitive information in text with mocked data." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text with mocked data. Replace sensitive information with realistic but fake data. Return the masked text and a mapping of original values to mocked data. + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Mocked data mapping: + "John Smith": "Michael Johnson" + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601" + "(555) 123-4567": "(312) 555-7890" + + Masked text: + Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Mocked data mapping: + "Sarah Johnson": "Emma Thompson" + "laptop": "tablet" + "TechStore": "GadgetWorld" + "2023-05-15": "2023-06-22" + "sarah.j@email.com": "emma.t@fakemail.com" + + Masked text: + Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the mocked data mapping with the original value and respective mocked data, and then the Masked text with the mocked data. + """ + } + ] + + response_step1 = self.llm.request(messages_step1) + + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and mocked data mappings into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of original values and their mocked data) and "masked_text" (the text with mocked data). + + Example 1: + Mocked data mapping: + "John Smith": "Michael Johnson" + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601" + "(555) 123-4567": "(312) 555-7890" + + Masked text: + Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890. + + Output: + {{ + "mapping": {{ + "John Smith": "Michael Johnson", + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601", + "(555) 123-4567": "(312) 555-7890" + }}, + "masked_text": "Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890." + }} + + Example 2: + Mocked data mapping: + "Sarah Johnson": "Emma Thompson" + "laptop": "tablet" + "TechStore": "GadgetWorld" + "2023-05-15": "2023-06-22" + "sarah.j@email.com": "emma.t@fakemail.com" + + Masked text: + Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com. + + Output: + {{ + "mapping": {{ + "Sarah Johnson": "Emma Thompson", + "laptop": "tablet", + "TechStore": "GadgetWorld", + "2023-05-15": "2023-06-22", + "sarah.j@email.com": "emma.t@fakemail.com" + }}, + "masked_text": "Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com." + }} + + Now, please convert the following masked text and mocked data mapping into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + response_step2 = self.llm.request(messages_step2, MaskContract) + + masked_text = response_step2.masked_text + mapping = response_step2.mapping + + for original, mocked in mapping.items(): + if original in masked_text: + masked_text = masked_text.replace(original, mocked) + + response_step2.masked_text = masked_text + + return response_step2 + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for mocked, original in mapping.items(): + masked_content = masked_content.replace(mocked, original) + return masked_content \ No newline at end of file diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py new file mode 100644 index 0000000..0250a1e --- /dev/null +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -0,0 +1,236 @@ +import re +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract + +class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): + def __init__(self, llm: LLM): + super().__init__(llm) + self.placeholder_counter = {} + + async def mask_content(self, content: str) -> MaskContract: + # Step 1: Get masked text and placeholder list + messages_step1 = [ + { + "role": "system", + "content": "You are an AI assistant that masks sensitive information in text." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Example 3: + Original text: + Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. + + Placeholder list: + [PERSON1]: Dr. Emily Brown + [DATE1]: 1985-03-22 + [HOSPITAL1]: Central Hospital + [PERSON2]: Mr. David Lee + [DATE2]: 2023-06-10 + [TIME1]: 2:30 PM + + Masked text: + [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. + """ + } + ] + + response_step1 = self.llm.request(messages_step1) + + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). + Always use [], not "" or '' + You can only have one placeholder for each type and vice versa. + Make sure that masked_text contains no sensitive information, only the placeholders. + + Example 1: + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." + }} + + Example 2: + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "Sarah Johnson", + "[PRODUCT1]": "laptop", + "[STORE1]": "TechStore", + "[DATE1]": "2023-05-15", + "[EMAIL1]": "sarah.j@email.com" + }}, + "masked_text": "[PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]." + }} + + Example 3: + Placeholder list: + [PERSON1]: Dr. Emily Brown + [DATE1]: 1985-03-22 + [HOSPITAL1]: Central Hospital + [PERSON2]: Mr. David Lee + [DATE2]: 2023-06-10 + [TIME1]: 2:30 PM + + Masked text: + [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "Dr. Emily Brown", + "[DATE1]": "1985-03-22", + "[HOSPITAL1]": "Central Hospital", + "[PERSON2]": "Mr. David Lee", + "[DATE2]": "2023-06-10", + "[TIME1]": "2:30 PM" + }}, + "masked_text": "[PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]." + }} + + Example 4: + Placeholder list: + [COMPANY1]: Company XYZ + [PERSON1]: Jane Doe + [COMPANY2]: ABC Corp + [DATE1]: July 1, 2023 + [AMOUNT1]: $500 million + + Masked text: + [COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]. + + Output: + {{ + "mapping": {{ + "[COMPANY1]": "Company XYZ", + "[PERSON1]": "Jane Doe", + "[COMPANY2]": "ABC Corp", + "[DATE1]": "July 1, 2023", + "[AMOUNT1]": "$500 million" + }}, + "masked_text": "[COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]." + }} + + Example 5: + Placeholder list: + [CREDITCARD1]: 4111-1111-1111-1111 + [PERSON1]: Michael Johnson + [DATE1]: 12/25 + [CVV1]: 123 + + Masked text: + The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]. + + Output: + {{ + "mapping": {{ + "[CREDITCARD1]": "4111-1111-1111-1111", + "[PERSON1]": "Michael Johnson", + "[DATE1]": "12/25", + "[CVV1]": "123" + }}, + "masked_text": "The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]." + }} + + Now, please convert the following masked text and placeholder list into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + response_step2 = self.llm.request(messages_step2, MaskContract) + + masked_text = response_step2.masked_text + mapping = response_step2.mapping + + for placeholder, value in mapping.items(): + if value in masked_text: + masked_text = masked_text.replace(value, placeholder) + + response_step2.masked_text = masked_text + + return response_step2 + + def get_placeholder(self, info_type): + if info_type not in self.placeholder_counter: + self.placeholder_counter[info_type] = 0 + self.placeholder_counter[info_type] += 1 + return f"[{info_type}{self.placeholder_counter[info_type]}]" + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content \ No newline at end of file diff --git a/extract_thinker/process.py b/extract_thinker/process.py index 6a5ff86..6875fad 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -15,12 +15,19 @@ from extract_thinker.llm import LLM from extract_thinker.models.MaskContract import MaskContract from enum import Enum +from extract_thinker.masking.llm_masking_strategy import LLMMaskingStrategy +from extract_thinker.masking.simple_placeholder_masking_strategy import SimplePlaceholderMaskingStrategy +from extract_thinker.masking.mocked_data_masking_strategy import MockedDataMaskingStrategy +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy class ClassificationStrategy(Enum): CONSENSUS = "consensus" HIGHER_ORDER = "higher_order" CONSENSUS_WITH_THRESHOLD = "both" +class MaskingStrategy(Enum): + SIMPLE_PLACEHOLDER = "simple_placeholder" + MOCKED_DATA = "mocked_data" class Process: def __init__(self): @@ -33,232 +40,31 @@ def __init__(self): self.file_path: Optional[str] = None self.file_stream: Optional[IO] = None self.splitter: Optional[Splitter] = None - self.masking_llm: Optional[LLM] = None - self.masking_enabled: bool = False - - def add_masking_llm(self, model: Optional[str] = None) -> None: - self.masking_enabled = True + self.masking_strategy: Optional[AbstractMaskingStrategy] = None + self.llm: Optional[LLM] = None + def add_masking_llm(self, model: Optional[str] = None, strategy: Optional[MaskContract] = MaskingStrategy.SIMPLE_PLACEHOLDER): if isinstance(model, LLM): - self.masking_llm = model + self.llm = model elif model is not None: - self.masking_llm = LLM(model) - else: - raise ValueError("Either a model string or an LLM object must be provided.") - + self.llm = LLM(model) + + if strategy == MaskingStrategy.SIMPLE_PLACEHOLDER: + self.masking_strategy = SimplePlaceholderMaskingStrategy(self.llm) + elif strategy == MaskingStrategy.MOCKED_DATA: + self.masking_strategy = MockedDataMaskingStrategy(self.llm) + async def mask_content(self, content: str) -> MaskContract: - if not self.masking_enabled or not self.masking_llm: - raise ValueError("Masking is not enabled, please set a masking llm with add_masking_llm") - - # Step 1: Get masked text and placeholder list - messages_step1 = [ - { - "role": "system", - "content": "You are an AI assistant that masks sensitive information in text." - }, - { - "role": "user", - "content": f""" - Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. - - Here are some examples: - - Example 1: - Original text: - John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. - - Placeholder list: - [PERSON1]: John Smith - [ADDRESS1]: 123 Main St, New York, NY 10001 - [PHONE1]: (555) 123-4567 - - Masked text: - [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. - - Example 2: - Original text: - Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. - - Placeholder list: - [PERSON1]: Sarah Johnson - [PRODUCT1]: laptop - [STORE1]: TechStore - [DATE1]: 2023-05-15 - [EMAIL1]: sarah.j@email.com - - Masked text: - [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. - - Example 3: - Original text: - Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. - - Placeholder list: - [PERSON1]: Dr. Emily Brown - [DATE1]: 1985-03-22 - [HOSPITAL1]: Central Hospital - [PERSON2]: Mr. David Lee - [DATE2]: 2023-06-10 - [TIME1]: 2:30 PM - - Masked text: - [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. - - Now, please mask the following text: - - Text to mask: - {content} - - Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. - """ - } - ] - - response_step1 = self.masking_llm.request(messages_step1) - - response_step1_content = response_step1.choices[0].message.content - - # Step 2: Convert to JSON format - messages_step2 = [ - { - "role": "system", - "content": "You are an AI assistant that converts masked text information into JSON format." - }, - { - "role": "user", - "content": f""" - Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). - Always use [], not "" or '' - Make sure that masked_text contains no sensitive information, only the placeholders. - - Example 1: - Placeholder list: - [PERSON1]: John Smith - [ADDRESS1]: 123 Main St, New York, NY 10001 - [PHONE1]: (555) 123-4567 - - Masked text: - [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "John Smith", - "[ADDRESS1]": "123 Main St, New York, NY 10001", - "[PHONE1]": "(555) 123-4567" - }}, - "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." - }} - - Example 2: - Placeholder list: - [PERSON1]: Sarah Johnson - [PRODUCT1]: laptop - [STORE1]: TechStore - [DATE1]: 2023-05-15 - [EMAIL1]: sarah.j@email.com - - Masked text: - [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "Sarah Johnson", - "[PRODUCT1]": "laptop", - "[STORE1]": "TechStore", - "[DATE1]": "2023-05-15", - "[EMAIL1]": "sarah.j@email.com" - }}, - "masked_text": "[PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]." - }} - - Example 3: - Placeholder list: - [PERSON1]: Dr. Emily Brown - [DATE1]: 1985-03-22 - [HOSPITAL1]: Central Hospital - [PERSON2]: Mr. David Lee - [DATE2]: 2023-06-10 - [TIME1]: 2:30 PM - - Masked text: - [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "Dr. Emily Brown", - "[DATE1]": "1985-03-22", - "[HOSPITAL1]": "Central Hospital", - "[PERSON2]": "Mr. David Lee", - "[DATE2]": "2023-06-10", - "[TIME1]": "2:30 PM" - }}, - "masked_text": "[PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]." - }} - - Example 4: - Placeholder list: - [COMPANY1]: Company XYZ - [PERSON1]: Jane Doe - [COMPANY2]: ABC Corp - [DATE1]: July 1, 2023 - [AMOUNT1]: $500 million - - Masked text: - [COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]. - - Output: - {{ - "mapping": {{ - "[COMPANY1]": "Company XYZ", - "[PERSON1]": "Jane Doe", - "[COMPANY2]": "ABC Corp", - "[DATE1]": "July 1, 2023", - "[AMOUNT1]": "$500 million" - }}, - "masked_text": "[COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]." - }} - - Example 5: - Placeholder list: - [CREDITCARD1]: 4111-1111-1111-1111 - [PERSON1]: Michael Johnson - [DATE1]: 12/25 - [CVV1]: 123 - - Masked text: - The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]. - - Output: - {{ - "mapping": {{ - "[CREDITCARD1]": "4111-1111-1111-1111", - "[PERSON1]": "Michael Johnson", - "[DATE1]": "12/25", - "[CVV1]": "123" - }}, - "masked_text": "The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]." - }} - - Now, please convert the following masked text and placeholder list into JSON format: - - {response_step1_content} - - ##JSON - """ - } - ] - - response_step2 = self.masking_llm.request(messages_step2, MaskContract) - - return response_step2 - + if self.masking_strategy is None: + raise ValueError("No masking strategy has been set. Please set a masking strategy with add_masking_strategy.") + + return await self.masking_strategy.mask_content(content) + def unmask_content(self, masked_content: str, mapping: dict) -> str: - for placeholder, original in mapping.items(): - masked_content = masked_content.replace(placeholder, original) - return masked_content + if self.masking_strategy is None: + raise ValueError("No masking strategy has been set. Please set a masking strategy with add_masking_strategy.") + + return self.masking_strategy.unmask_content(masked_content, mapping) def set_document_loader_for_file_type(self, file_type: str, document_loader: DocumentLoader): if self.document_loader is not None: @@ -524,4 +330,4 @@ async def process_doc_groups(groups: List[Any]) -> List[Any]: process_doc_groups(doc_groups) ) - return processedGroups + return processedGroups \ No newline at end of file diff --git a/tests/test_process.py b/tests/test_process.py index 6f52a1a..d9cb5e4 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -3,7 +3,7 @@ from dotenv import load_dotenv from extract_thinker.extractor import Extractor -from extract_thinker.process import Process +from extract_thinker.process import MaskingStrategy, Process from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf from extract_thinker.llm import LLM import asyncio @@ -79,8 +79,8 @@ def test_mask_invoice(): process = Process() process.load_document_loader(DocumentLoaderPyPdf()) process.load_file(test_file_path) - llm = LLM("ollama/qwen2.5:3b", "http://localhost:11434") - process.add_masking_llm(llm) + llm = LLM("groq/llama-3.2-3b-preview") + process.add_masking_llm(llm, MaskingStrategy.MOCKED_DATA) # Act content = process.document_loader.load_content_from_file(test_file_path) @@ -92,34 +92,11 @@ def test_mask_invoice(): assert result.masked_text is not None assert result.mapping is not None - # Check if sensitive information is masked - sensitive_info = [ - "Market Financial Consulting", "450 East 78th Ave", - "Denver, CO 12345", "(123) 456-7890", "(123) 456-7891", - "Gaurav Cheema", "Caneiro Group", "89 Pacific Ave", - "San Francisco, CA 78910", "375.00", "1125.00" - ] - for info in sensitive_info: - assert info not in result.masked_text, f"{info} was not masked properly" - - # Check if placeholders are present in masked text - placeholder_types = ["COMPANY", "ADDRESS", "PHONE", "NAME", "AMOUNT"] - assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" - - # Check mapping - assert len(result.mapping) >= 8, "Mapping should contain at least 8 items" - assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" - assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" - # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) - for info in sensitive_info: - assert info in unmasked_content, f"Unmasking failed for '{info}'" - - # Check if all masked content is unmasked - for placeholder, original in result.mapping.items(): - assert original in unmasked_content, f"Unmasking failed for {original}" - assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" + + # Check if all masked content is the same as the original content + assert content == unmasked_content, "Unmasked content does not match the original content" if __name__ == "__main__": test_mask_invoice() \ No newline at end of file From a56935875096e900863bb491de5fac515a22c22e Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Thu, 3 Oct 2024 00:39:14 +0100 Subject: [PATCH 03/12] simple setup for the article --- .../masking/mocked_data_masking_strategy.py | 1 + .../simple_placeholder_masking_strategy.py | 5 +- tests/test_process.py | 97 ++++++++++++++----- 3 files changed, 75 insertions(+), 28 deletions(-) diff --git a/extract_thinker/masking/mocked_data_masking_strategy.py b/extract_thinker/masking/mocked_data_masking_strategy.py index ad201b4..d62309f 100644 --- a/extract_thinker/masking/mocked_data_masking_strategy.py +++ b/extract_thinker/masking/mocked_data_masking_strategy.py @@ -18,6 +18,7 @@ async def mask_content(self, content: str) -> MaskContract: "role": "user", "content": f""" Please mask all sensitive information in the following text with mocked data. Replace sensitive information with realistic but fake data. Return the masked text and a mapping of original values to mocked data. + - Keep all values, doesnt constitute sensitive information Here are some examples: diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 0250a1e..7f38564 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -19,7 +19,7 @@ async def mask_content(self, content: str) -> MaskContract: "role": "user", "content": f""" Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. - + - Keep all values, doesnt constitute sensitive information Here are some examples: Example 1: @@ -90,7 +90,8 @@ async def mask_content(self, content: str) -> MaskContract: Always use [], not "" or '' You can only have one placeholder for each type and vice versa. Make sure that masked_text contains no sensitive information, only the placeholders. - + Keep all values, doesnt constitute sensitive information + Example 1: Placeholder list: [PERSON1]: John Smith diff --git a/tests/test_process.py b/tests/test_process.py index d9cb5e4..15b5fc2 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -72,31 +72,76 @@ def test_mask(): assert original in unmasked_content, f"Unmasking failed for {original}" assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" -def test_mask_invoice(): - # Arrange - test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") - - process = Process() - process.load_document_loader(DocumentLoaderPyPdf()) - process.load_file(test_file_path) - llm = LLM("groq/llama-3.2-3b-preview") - process.add_masking_llm(llm, MaskingStrategy.MOCKED_DATA) - - # Act - content = process.document_loader.load_content_from_file(test_file_path) - # concat all the text from the content list - content = "".join([item for item in content["text"]]) - result = asyncio.run(process.mask_content(content)) - - # Assert - assert result.masked_text is not None - assert result.mapping is not None - - # Test unmasking - unmasked_content = process.unmask_content(result.masked_text, result.mapping) - - # Check if all masked content is the same as the original content - assert content == unmasked_content, "Unmasked content does not match the original content" +# def test_mask_invoice(): +# # Arrange +# test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + +# process = Process() +# process.load_document_loader(DocumentLoaderPyPdf()) +# process.load_file(test_file_path) +# process.add_masking_llm("groq/llama-3.2-11b-text-preview") + +# # Act +# test_text = ( +# "Market Financial Consulting Experts in earning trusts 450 East 78th Ave Denver, CO 12345 " +# "Phone : (123) 456 -7890 Fax: (123) 456 -7891 INVOICE INVOICE: 00012 DATE: 1/30/23 " +# "TO: Gaurav Cheema Caneiro Group 89 Pacific Ave San Francisco, CA 78910 " +# "FOR: Consultation services DESCRIPTION HOURS RATE AMOUNT " +# "Consultation services 3.0 375.00 1125.00 " +# "TOTAL 1125.00 Make all checks payable to Market Financial Consulting " +# "Total due in 15 days. Overdue accounts subject to a service charge of 1% per month. " +# "THANK YOU FOR YOUR BUSINESS!" +# ) + +# result = asyncio.run(process.mask_content(test_text)) + +# # Assert +# assert result.masked_text is not None +# assert result.mapping is not None + +# # Check if all original sensitive information is masked +# sensitive_info = [ +# "Market Financial Consulting", "450 East 78th Ave", "Denver, CO 12345", +# "(123) 456 -7890", "(123) 456 -7891", +# "Gaurav Cheema", "Caneiro Group", "89 Pacific Ave", "San Francisco, CA 78910" +# ] +# for info in sensitive_info: +# assert info not in result.masked_text, f"{info} was not masked properly" + +# # Check if placeholders are present in masked text +# placeholder_types = ["COMPANY", "ADDRESS", "PHONE", "NAME"] +# assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" + +# # Check if non-sensitive information is still present +# non_sensitive_info = [ +# "INVOICE", "DATE", "FOR: Consultation services", +# "DESCRIPTION", "HOURS", "RATE", "AMOUNT", +# "3.0", "375.00", "1125.00", "TOTAL", +# "Make all checks payable to", +# "Total due in 15 days. Overdue accounts subject to a service charge of 1% per month.", +# "THANK YOU FOR YOUR BUSINESS!" +# ] +# for info in non_sensitive_info: +# assert info in result.masked_text, f"{info} was unexpectedly masked" + +# # Check mapping +# assert len(result.mapping) >= 5, "Mapping should contain at least 5 items" +# assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" +# assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" + +# # Test unmasking +# unmasked_content = process.unmask_content(result.masked_text, result.mapping) +# assert "Market Financial Consulting" in unmasked_content, "Unmasking failed for 'Market Financial Consulting'" +# assert "450 East 78th Ave" in unmasked_content, "Unmasking failed for '450 East 78th Ave'" +# assert "Gaurav Cheema" in unmasked_content, "Unmasking failed for 'Gaurav Cheema'" + +# # Check if all masked content is unmasked +# for placeholder, original in result.mapping.items(): +# assert original in unmasked_content, f"Unmasking failed for {original}" +# assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" + +# # Check if all masked content is the same as the original content +# assert test_text == unmasked_content, "Unmasked content does not match the original content" if __name__ == "__main__": - test_mask_invoice() \ No newline at end of file + test_mask() \ No newline at end of file From 4f06bd0fff1e599ec56a18c051b5f29b93beac3f Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 22 Oct 2024 16:05:57 +0200 Subject: [PATCH 04/12] new version of simple masking --- .../simple_placeholder_masking_strategy.py | 332 +++++++----------- tests/test_process.py | 117 ++++-- 2 files changed, 220 insertions(+), 229 deletions(-) diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 7f38564..60b0888 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -3,226 +3,162 @@ from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy from extract_thinker.models.MaskContract import MaskContract + class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): + MASK_PII_PROMPT = ( + "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " + "Do not mask numerical values or non-PII data." + ) + + MASK_PII_USER_PROMPT = """Please mask all PII in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values, dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. + +Here are some examples: + +Example 1: +Original text: +John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. He deposited $5,000 on 2023-07-15. + +Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 + +Masked text: +[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15. + +Example 2: +Original text: +Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. The total amount was $1,200. + +Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com + +Masked text: +[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200. + +Example 3: +Original text: +Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. + +Placeholder list: +[PERSON1]: Dr. Emily Brown +[PERSON2]: Mr. David Lee + +Masked text: +[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. + +Now, please mask the following text: + +Text to mask: +{content} + +Provide the placeholder list with their original values, followed by the masked text. +""" + + CONVERT_TO_JSON_PROMPT = ( + "You are an AI assistant that converts masked text information into JSON format, " + "preserving only the masking for PII." + ) + + CONVERT_TO_JSON_USER_PROMPT = """Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original PII values) and "masked_text" (the text with PII placeholders). Do not include non-PII information such as numerical values, dates, or amounts. + +Example 1: +Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 + +Masked text: +[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15. + +Output: +{{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15." +}} + +Example 2: +Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com + +Masked text: +[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200. + +Output: +{{ + "mapping": {{ + "[PERSON1]": "Sarah Johnson", + "[EMAIL1]": "sarah.j@email.com" + }}, + "masked_text": "[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200." +}} + +Example 3: +Placeholder list: +[PERSON1]: Dr. Emily Brown +[PERSON2]: Mr. David Lee + +Masked text: +[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. + +Output: +{{ + "mapping": {{ + "[PERSON1]": "Dr. Emily Brown", + "[PERSON2]": "Mr. David Lee" + }}, + "masked_text": "[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500." +}} + +Now, please convert the following masked text and placeholder list into JSON format: + +{response_step1_content} + +##JSON +""" + def __init__(self, llm: LLM): super().__init__(llm) self.placeholder_counter = {} async def mask_content(self, content: str) -> MaskContract: - # Step 1: Get masked text and placeholder list + response_step1_content = await self._step1_mask_pii(content) + mask_contract = await self._step2_convert_to_json(response_step1_content) + return mask_contract + + async def _step1_mask_pii(self, content: str) -> str: messages_step1 = [ - { - "role": "system", - "content": "You are an AI assistant that masks sensitive information in text." - }, - { - "role": "user", - "content": f""" - Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. - - Keep all values, doesnt constitute sensitive information - Here are some examples: - - Example 1: - Original text: - John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. - - Placeholder list: - [PERSON1]: John Smith - [ADDRESS1]: 123 Main St, New York, NY 10001 - [PHONE1]: (555) 123-4567 - - Masked text: - [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. - - Example 2: - Original text: - Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. - - Placeholder list: - [PERSON1]: Sarah Johnson - [PRODUCT1]: laptop - [STORE1]: TechStore - [DATE1]: 2023-05-15 - [EMAIL1]: sarah.j@email.com - - Masked text: - [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. - - Example 3: - Original text: - Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. - - Placeholder list: - [PERSON1]: Dr. Emily Brown - [DATE1]: 1985-03-22 - [HOSPITAL1]: Central Hospital - [PERSON2]: Mr. David Lee - [DATE2]: 2023-06-10 - [TIME1]: 2:30 PM - - Masked text: - [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. - - Now, please mask the following text: - - Text to mask: - {content} - - Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. - """ - } + {"role": "system", "content": self.MASK_PII_PROMPT}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, ] - response_step1 = self.llm.request(messages_step1) - response_step1_content = response_step1.choices[0].message.content + return response_step1_content - # Step 2: Convert to JSON format + async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContract: messages_step2 = [ - { - "role": "system", - "content": "You are an AI assistant that converts masked text information into JSON format." - }, + {"role": "system", "content": self.CONVERT_TO_JSON_PROMPT}, { "role": "user", - "content": f""" - Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). - Always use [], not "" or '' - You can only have one placeholder for each type and vice versa. - Make sure that masked_text contains no sensitive information, only the placeholders. - Keep all values, doesnt constitute sensitive information - - Example 1: - Placeholder list: - [PERSON1]: John Smith - [ADDRESS1]: 123 Main St, New York, NY 10001 - [PHONE1]: (555) 123-4567 - - Masked text: - [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "John Smith", - "[ADDRESS1]": "123 Main St, New York, NY 10001", - "[PHONE1]": "(555) 123-4567" - }}, - "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." - }} - - Example 2: - Placeholder list: - [PERSON1]: Sarah Johnson - [PRODUCT1]: laptop - [STORE1]: TechStore - [DATE1]: 2023-05-15 - [EMAIL1]: sarah.j@email.com - - Masked text: - [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "Sarah Johnson", - "[PRODUCT1]": "laptop", - "[STORE1]": "TechStore", - "[DATE1]": "2023-05-15", - "[EMAIL1]": "sarah.j@email.com" - }}, - "masked_text": "[PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]." - }} - - Example 3: - Placeholder list: - [PERSON1]: Dr. Emily Brown - [DATE1]: 1985-03-22 - [HOSPITAL1]: Central Hospital - [PERSON2]: Mr. David Lee - [DATE2]: 2023-06-10 - [TIME1]: 2:30 PM - - Masked text: - [PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]. - - Output: - {{ - "mapping": {{ - "[PERSON1]": "Dr. Emily Brown", - "[DATE1]": "1985-03-22", - "[HOSPITAL1]": "Central Hospital", - "[PERSON2]": "Mr. David Lee", - "[DATE2]": "2023-06-10", - "[TIME1]": "2:30 PM" - }}, - "masked_text": "[PERSON1], born on [DATE1], works at [HOSPITAL1]. Her patient, [PERSON2], has an appointment on [DATE2] at [TIME1]." - }} - - Example 4: - Placeholder list: - [COMPANY1]: Company XYZ - [PERSON1]: Jane Doe - [COMPANY2]: ABC Corp - [DATE1]: July 1, 2023 - [AMOUNT1]: $500 million - - Masked text: - [COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]. - - Output: - {{ - "mapping": {{ - "[COMPANY1]": "Company XYZ", - "[PERSON1]": "Jane Doe", - "[COMPANY2]": "ABC Corp", - "[DATE1]": "July 1, 2023", - "[AMOUNT1]": "$500 million" - }}, - "masked_text": "[COMPANY1]'s CEO, [PERSON1], announced a merger with [COMPANY2] on [DATE1]. The deal is valued at [AMOUNT1]." - }} - - Example 5: - Placeholder list: - [CREDITCARD1]: 4111-1111-1111-1111 - [PERSON1]: Michael Johnson - [DATE1]: 12/25 - [CVV1]: 123 - - Masked text: - The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]. - - Output: - {{ - "mapping": {{ - "[CREDITCARD1]": "4111-1111-1111-1111", - "[PERSON1]": "Michael Johnson", - "[DATE1]": "12/25", - "[CVV1]": "123" - }}, - "masked_text": "The credit card number [CREDITCARD1] belongs to [PERSON1], expiring on [DATE1], with CVV [CVV1]." - }} - - Now, please convert the following masked text and placeholder list into JSON format: - - {response_step1_content} - - ##JSON - """ - } + "content": self.CONVERT_TO_JSON_USER_PROMPT.format( + response_step1_content=response_step1_content + ), + }, ] - response_step2 = self.llm.request(messages_step2, MaskContract) - masked_text = response_step2.masked_text mapping = response_step2.mapping for placeholder, value in mapping.items(): if value in masked_text: masked_text = masked_text.replace(value, placeholder) - response_step2.masked_text = masked_text - return response_step2 def get_placeholder(self, info_type): @@ -234,4 +170,4 @@ def get_placeholder(self, info_type): def unmask_content(self, masked_content: str, mapping: dict) -> str: for placeholder, original in mapping.items(): masked_content = masked_content.replace(placeholder, original) - return masked_content \ No newline at end of file + return masked_content diff --git a/tests/test_process.py b/tests/test_process.py index 15b5fc2..46db82e 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,16 +2,17 @@ import pytest from dotenv import load_dotenv -from extract_thinker.extractor import Extractor -from extract_thinker.process import MaskingStrategy, Process +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from extract_thinker.process import Process from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf -from extract_thinker.llm import LLM import asyncio load_dotenv() cwd = os.getcwd() -def test_mask(): +async def test_mask(): # Arrange test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") @@ -33,44 +34,98 @@ def test_mask(): "Our tax ID number is 12-3456789." ) - result = asyncio.run(process.mask_content(test_text)) + # Act + # result = asyncio.run(process.mask_content(test_text)) + result = await process.mask_content(test_text) # Assert assert result.masked_text is not None assert result.mapping is not None - # Check if all original sensitive information is masked - sensitive_info = [ - "George Collins", "123 Main St", "555-1234", - "Jane Smith", "456 Elm Avenue", "(987) 654-3210", - "Robert Johnson", "789 Corporate Blvd", - "1-800-555-9876", "support@example.com", - "Sarah Lee", "444-333-2222", "sarah.lee@company.com", - "$250,000", "$50,000", "$3,500", - "+1-555-987-6543", "12-3456789" + # Check if all original PII is masked + pii_info = ["George Collins", "Jane Smith", "Robert Johnson", "Sarah Lee"] + + non_pii_info = [ + "123 Main St", + "Anytown, USA 12345", + "555-1234", + "456 Elm Avenue", + "Othercity, State 67890", + "(987) 654-3210", + "789 Corporate Blvd, Suite 500, Bigcity, State 13579", + "1-800-555-9876", + "support@example.com", + "444-333-2222", + "sarah.lee@company.com", + "$250,000", + "$50,000", + "$3,500", + "+1-555-987-6543", + "12-3456789", ] - for info in sensitive_info: - assert info not in result.masked_text, f"{info} was not masked properly" - # Check if placeholders are present in masked text - placeholder_types = ["NAME", "ADDRESS", "PHONE", "EMAIL"] - assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" + # Ensure PII is masked + for info in pii_info: + assert info not in result.masked_text, f"PII {info} was not masked properly" - # Check mapping - assert len(result.mapping) >= 10, "Mapping should contain at least 10 items" - assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" - assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" + # Ensure non-PII data remains unchanged + for info in non_pii_info: + assert info in result.masked_text, f"Non-PII {info} was unexpectedly masked" + + # Check mapping contains only PII + expected_pii_placeholders = ["[PERSON1]", "[PERSON2]", "[PERSON3]", "[PERSON4]"] + for placeholder in expected_pii_placeholders: + assert placeholder in result.mapping, f"Expected placeholder {placeholder} not in mapping" + + assert len(result.mapping) == len(pii_info), "Mapping should contain only PII items" # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) - assert "George Collins" in unmasked_content, "Unmasking failed for 'George Collins'" - assert "123 Main St" in unmasked_content, "Unmasking failed for '123 Main St'" - assert "555-1234" in unmasked_content, "Unmasking failed for '555-1234'" + for info in pii_info: + assert info in unmasked_content, f"Unmasking failed for PII {info}" + + # Ensure non-PII data is unchanged after unmasking + for info in non_pii_info: + assert info in unmasked_content, f"Non-PII {info} was altered during unmasking" + +async def test_simple_use_case(): + # Arrange + test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + + process = Process() + process.load_document_loader(DocumentLoaderPyPdf()) + process.load_file(test_file_path) + process.add_masking_llm("groq/llama-3.2-3b-preview") - # Check if all masked content is unmasked - for placeholder, original in result.mapping.items(): - assert original in unmasked_content, f"Unmasking failed for {original}" - assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" + # Arrange + test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." + + # Act + result = await process.mask_content(test_text) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Ensure PII is masked + assert "John Doe" not in result.masked_text + assert "Jane Smith" not in result.masked_text + + # Ensure non-PII data remains + assert "$5000" in result.masked_text + assert "2021-05-01" in result.masked_text + assert "transferred" in result.masked_text + + # Check mapping + assert len(result.mapping) == 2 + assert "[PERSON1]" in result.mapping + assert "[PERSON2]" in result.mapping + assert result.mapping["[PERSON1]"] == "John Doe" + assert result.mapping["[PERSON2]"] == "Jane Smith" + + # Test unmasking + unmasked_content = process.unmask_content(result.masked_text, result.mapping) + assert unmasked_content == test_text # def test_mask_invoice(): # # Arrange @@ -144,4 +199,4 @@ def test_mask(): # assert test_text == unmasked_content, "Unmasked content does not match the original content" if __name__ == "__main__": - test_mask() \ No newline at end of file + asyncio.run(test_simple_use_case()) \ No newline at end of file From 941b084aceacf46e2e01304e2b0a9839dfe5920c Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 22 Oct 2024 16:59:08 +0200 Subject: [PATCH 05/12] working on the prompt to only include PII --- .../simple_placeholder_masking_strategy.py | 23 ++-- tests/test_process.py | 104 +++++++++++++----- 2 files changed, 91 insertions(+), 36 deletions(-) diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 60b0888..5fea6c6 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -7,10 +7,11 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): MASK_PII_PROMPT = ( "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " - "Do not mask numerical values or non-PII data." + "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " + "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." ) - MASK_PII_USER_PROMPT = """Please mask all PII in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values, dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. + MASK_PII_USER_PROMPT = """Please mask all PII in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not use underscores or spaces in placeholders. Do not mask numerical values, dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. Here are some examples: @@ -58,10 +59,11 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): CONVERT_TO_JSON_PROMPT = ( "You are an AI assistant that converts masked text information into JSON format, " - "preserving only the masking for PII." + "preserving only the masking for PII. Ensure that placeholders are strictly in the format [TYPE#], " + "without underscores or spaces." ) - CONVERT_TO_JSON_USER_PROMPT = """Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original PII values) and "masked_text" (the text with PII placeholders). Do not include non-PII information such as numerical values, dates, or amounts. + CONVERT_TO_JSON_USER_PROMPT = """Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original PII values) and "masked_text" (the text with PII placeholders). Do not include non-PII information such as numerical values, dates, or amounts. Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. Example 1: Placeholder list: @@ -130,6 +132,7 @@ def __init__(self, llm: LLM): async def mask_content(self, content: str) -> MaskContract: response_step1_content = await self._step1_mask_pii(content) mask_contract = await self._step2_convert_to_json(response_step1_content) + self._validate_placeholders(mask_contract) return mask_contract async def _step1_mask_pii(self, content: str) -> str: @@ -161,13 +164,13 @@ async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContr response_step2.masked_text = masked_text return response_step2 - def get_placeholder(self, info_type): - if info_type not in self.placeholder_counter: - self.placeholder_counter[info_type] = 0 - self.placeholder_counter[info_type] += 1 - return f"[{info_type}{self.placeholder_counter[info_type]}]" + def _validate_placeholders(self, mask_contract: MaskContract): + placeholder_pattern = re.compile(r'^\[[A-Za-z_]+[0-9]*\]$') + for placeholder in mask_contract.mapping.keys(): + if not placeholder_pattern.match(placeholder): + raise ValueError(f"Invalid placeholder format: {placeholder}") def unmask_content(self, masked_content: str, mapping: dict) -> str: for placeholder, original in mapping.items(): masked_content = masked_content.replace(placeholder, original) - return masked_content + return masked_content \ No newline at end of file diff --git a/tests/test_process.py b/tests/test_process.py index 46db82e..dcbabb0 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -19,7 +19,7 @@ async def test_mask(): process = Process() process.load_document_loader(DocumentLoaderPyPdf()) process.load_file(test_file_path) - process.add_masking_llm("groq/llama-3.2-3b-preview") + process.add_masking_llm("groq/llama-3.2-11b-text-preview") # Act test_text = ( @@ -43,51 +43,103 @@ async def test_mask(): assert result.mapping is not None # Check if all original PII is masked - pii_info = ["George Collins", "Jane Smith", "Robert Johnson", "Sarah Lee"] + pii_info = { + "persons": ["George Collins", "Jane Smith", "Robert Johnson", "Sarah Lee"], + "addresses": [ + "123 Main St, Anytown, USA 12345", + "456 Elm Avenue, Othercity, State 67890", + "789 Corporate Blvd, Suite 500, Bigcity, State 13579", + ], + "phones": ["555-1234", "(987) 654-3210", "1-800-555-9876", "444-333-2222", "+1-555-987-6543"], + "emails": ["support@example.com", "sarah.lee@company.com"], + "financial": ["$250,000", "$50,000", "$3,500"], + "tax_id": ["12-3456789"], + } non_pii_info = [ - "123 Main St", - "Anytown, USA 12345", - "555-1234", - "456 Elm Avenue", - "Othercity, State 67890", - "(987) 654-3210", - "789 Corporate Blvd, Suite 500, Bigcity, State 13579", - "1-800-555-9876", - "support@example.com", - "444-333-2222", - "sarah.lee@company.com", - "$250,000", - "$50,000", - "$3,500", - "+1-555-987-6543", - "12-3456789", + "Monthly maintenance costs are estimated at $3,500.", + "For international clients, please use +1-555-987-6543.", ] # Ensure PII is masked - for info in pii_info: - assert info not in result.masked_text, f"PII {info} was not masked properly" + for person in pii_info["persons"]: + assert person not in result.masked_text, f"PII {person} was not masked properly" + + for address in pii_info["addresses"]: + assert address not in result.masked_text, f"PII address {address} was not masked properly" + + for phone in pii_info["phones"]: + assert phone not in result.masked_text, f"PII phone {phone} was not masked properly" + + for email in pii_info["emails"]: + assert email not in result.masked_text, f"PII email {email} was not masked properly" + + for fin in pii_info["financial"]: + assert fin not in result.masked_text, f"PII financial info {fin} was not masked properly" + + for tax in pii_info["tax_id"]: + assert tax not in result.masked_text, f"PII tax ID {tax} was not masked properly" # Ensure non-PII data remains unchanged for info in non_pii_info: assert info in result.masked_text, f"Non-PII {info} was unexpectedly masked" - # Check mapping contains only PII - expected_pii_placeholders = ["[PERSON1]", "[PERSON2]", "[PERSON3]", "[PERSON4]"] + # Check mapping contains only PII with correct placeholders + expected_pii_placeholders = [ + "[PERSON1]", "[ADDRESS1]", "[PHONE1]", + "[PERSON2]", "[ADDRESS2]", "[EMAIL1]", + "[PERSON3]", "[ADDRESS3]", "[PHONE2]", + "[PHONE3]", "[EMAIL2]", "[Budget]", + "[Contingency]", "[PHONE4]", "[TaxID]" + ] + for placeholder in expected_pii_placeholders: assert placeholder in result.mapping, f"Expected placeholder {placeholder} not in mapping" - assert len(result.mapping) == len(pii_info), "Mapping should contain only PII items" + assert len(result.mapping) == ( + len(pii_info["persons"]) + + len(pii_info["addresses"]) + + len(pii_info["phones"]) + + len(pii_info["emails"]) + + len(pii_info["financial"]) + + len(pii_info["tax_id"]) + ), "Mapping should contain all PII items" + + # Verify mappings are correct + expected_mapping = { + "[PERSON1]": "George Collins", + "[ADDRESS1]": "123 Main St, Anytown, USA 12345", + "[PHONE1]": "555-1234", + "[PERSON2]": "Jane Smith", + "[ADDRESS2]": "456 Elm Avenue, Othercity, State 67890", + "[EMAIL1]": "support@example.com", + "[PERSON3]": "Robert Johnson", + "[ADDRESS3]": "789 Corporate Blvd, Suite 500, Bigcity, State 13579", + "[PHONE2]": "444-333-2222", + "[PHONE3]": "+1-555-987-6543", + "[EMAIL2]": "sarah.lee@company.com", + "[Budget]": "$250,000", + "[Contingency]": "$50,000", + "[PHONE4]": "1-800-555-9876", + "[TaxID]": "12-3456789", + } + + for placeholder, original in expected_mapping.items(): + assert result.mapping.get(placeholder) == original, f"Mapping for {placeholder} is incorrect" # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) - for info in pii_info: - assert info in unmasked_content, f"Unmasking failed for PII {info}" + for category in pii_info.values(): + for info in category: + assert info in unmasked_content, f"Unmasking failed for PII {info}" # Ensure non-PII data is unchanged after unmasking for info in non_pii_info: assert info in unmasked_content, f"Non-PII {info} was altered during unmasking" + # Optionally, verify the entire unmasked content matches the original + assert unmasked_content == test_text, "Unmasked content does not match the original content" + async def test_simple_use_case(): # Arrange test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") @@ -199,4 +251,4 @@ async def test_simple_use_case(): # assert test_text == unmasked_content, "Unmasked content does not match the original content" if __name__ == "__main__": - asyncio.run(test_simple_use_case()) \ No newline at end of file + asyncio.run(test_mask()) \ No newline at end of file From ce225de1210d94ab27cbd2738f7093521ad66ec8 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 22 Oct 2024 18:23:00 +0200 Subject: [PATCH 06/12] adding a couple of non PII elements to the prompt --- .../simple_placeholder_masking_strategy.py | 28 +++++----- tests/test_process.py | 55 ------------------- 2 files changed, 15 insertions(+), 68 deletions(-) diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 5fea6c6..2c2a7e0 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -11,43 +11,45 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." ) - MASK_PII_USER_PROMPT = """Please mask all PII in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not use underscores or spaces in placeholders. Do not mask numerical values, dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. + MASK_PII_USER_PROMPT = """Please mask only Personally Identifiable Information (PII) in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], [PHONE1], [TAXID1], etc. Do not use underscores or spaces in placeholders. Mask names, addresses, email addresses, phone numbers (including international formats), tax IDs, and other PII. Do not mask numerical values (except phone numbers and tax IDs), dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. -Here are some examples: +Here are some examples of correct masking: Example 1: Original text: -John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. He deposited $5,000 on 2023-07-15. +John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. For international calls, use +1-555-987-6543. He deposited $5,000 on 2023-07-15. Placeholder list: [PERSON1]: John Smith [ADDRESS1]: 123 Main St, New York, NY 10001 [PHONE1]: (555) 123-4567 +[TAXID1]: 123-45-6789 +[PHONE2]: +1-555-987-6543 Masked text: -[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15. +[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1] and his SSN is [TAXID1]. For international calls, use [PHONE2]. He deposited $5,000 on 2023-07-15. Example 2: Original text: -Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. The total amount was $1,200. +Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. Placeholder list: [PERSON1]: Sarah Johnson [EMAIL1]: sarah.j@email.com +[PHONE1]: 1-800-555-1234 +[TAXID1]: 12-3456789 Masked text: -[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200. +[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1] and her work number is [PHONE1]. The company's EIN is [TAXID1]. The total amount was $1,200. -Example 3: +Example 3 (Demonstrating what NOT to mask): Original text: -Dr. Emily Brown, born on 1985-03-22, works at Central Hospital. Her patient, Mr. David Lee, has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. - -Placeholder list: -[PERSON1]: Dr. Emily Brown -[PERSON2]: Mr. David Lee +The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. Masked text: -[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. +The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. + +Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. Now, please mask the following text: diff --git a/tests/test_process.py b/tests/test_process.py index dcbabb0..a35890e 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -52,13 +52,11 @@ async def test_mask(): ], "phones": ["555-1234", "(987) 654-3210", "1-800-555-9876", "444-333-2222", "+1-555-987-6543"], "emails": ["support@example.com", "sarah.lee@company.com"], - "financial": ["$250,000", "$50,000", "$3,500"], "tax_id": ["12-3456789"], } non_pii_info = [ "Monthly maintenance costs are estimated at $3,500.", - "For international clients, please use +1-555-987-6543.", ] # Ensure PII is masked @@ -74,9 +72,6 @@ async def test_mask(): for email in pii_info["emails"]: assert email not in result.masked_text, f"PII email {email} was not masked properly" - for fin in pii_info["financial"]: - assert fin not in result.masked_text, f"PII financial info {fin} was not masked properly" - for tax in pii_info["tax_id"]: assert tax not in result.masked_text, f"PII tax ID {tax} was not masked properly" @@ -84,58 +79,8 @@ async def test_mask(): for info in non_pii_info: assert info in result.masked_text, f"Non-PII {info} was unexpectedly masked" - # Check mapping contains only PII with correct placeholders - expected_pii_placeholders = [ - "[PERSON1]", "[ADDRESS1]", "[PHONE1]", - "[PERSON2]", "[ADDRESS2]", "[EMAIL1]", - "[PERSON3]", "[ADDRESS3]", "[PHONE2]", - "[PHONE3]", "[EMAIL2]", "[Budget]", - "[Contingency]", "[PHONE4]", "[TaxID]" - ] - - for placeholder in expected_pii_placeholders: - assert placeholder in result.mapping, f"Expected placeholder {placeholder} not in mapping" - - assert len(result.mapping) == ( - len(pii_info["persons"]) + - len(pii_info["addresses"]) + - len(pii_info["phones"]) + - len(pii_info["emails"]) + - len(pii_info["financial"]) + - len(pii_info["tax_id"]) - ), "Mapping should contain all PII items" - - # Verify mappings are correct - expected_mapping = { - "[PERSON1]": "George Collins", - "[ADDRESS1]": "123 Main St, Anytown, USA 12345", - "[PHONE1]": "555-1234", - "[PERSON2]": "Jane Smith", - "[ADDRESS2]": "456 Elm Avenue, Othercity, State 67890", - "[EMAIL1]": "support@example.com", - "[PERSON3]": "Robert Johnson", - "[ADDRESS3]": "789 Corporate Blvd, Suite 500, Bigcity, State 13579", - "[PHONE2]": "444-333-2222", - "[PHONE3]": "+1-555-987-6543", - "[EMAIL2]": "sarah.lee@company.com", - "[Budget]": "$250,000", - "[Contingency]": "$50,000", - "[PHONE4]": "1-800-555-9876", - "[TaxID]": "12-3456789", - } - - for placeholder, original in expected_mapping.items(): - assert result.mapping.get(placeholder) == original, f"Mapping for {placeholder} is incorrect" - # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) - for category in pii_info.values(): - for info in category: - assert info in unmasked_content, f"Unmasking failed for PII {info}" - - # Ensure non-PII data is unchanged after unmasking - for info in non_pii_info: - assert info in unmasked_content, f"Non-PII {info} was altered during unmasking" # Optionally, verify the entire unmasked content matches the original assert unmasked_content == test_text, "Unmasked content does not match the original content" From b8888a8b7e662636c53f0e983566df185f624faa Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 22 Oct 2024 20:19:44 +0200 Subject: [PATCH 07/12] before adding ReAct --- .../simple_placeholder_masking_strategy.py | 85 ++++++------------ extract_thinker/models/MaskContract.py | 6 ++ tests/test_process.py | 87 ++----------------- 3 files changed, 41 insertions(+), 137 deletions(-) diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 2c2a7e0..1707ac1 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -1,7 +1,7 @@ import re from extract_thinker.llm import LLM from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy -from extract_thinker.models.MaskContract import MaskContract +from extract_thinker.models.MaskContract import MaskContract, MaskContractDict class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): @@ -9,6 +9,9 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." + "Do not mask the key value result, they will be masked later." + "Dont return masked text, only the placeholder list." + "Provide a step-by-step reasoning when identifying PII." ) MASK_PII_USER_PROMPT = """Please mask only Personally Identifiable Information (PII) in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], [PHONE1], [TAXID1], etc. Do not use underscores or spaces in placeholders. Mask names, addresses, email addresses, phone numbers (including international formats), tax IDs, and other PII. Do not mask numerical values (except phone numbers and tax IDs), dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. @@ -26,9 +29,6 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): [TAXID1]: 123-45-6789 [PHONE2]: +1-555-987-6543 -Masked text: -[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1] and his SSN is [TAXID1]. For international calls, use [PHONE2]. He deposited $5,000 on 2023-07-15. - Example 2: Original text: Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. @@ -39,16 +39,10 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): [PHONE1]: 1-800-555-1234 [TAXID1]: 12-3456789 -Masked text: -[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1] and her work number is [PHONE1]. The company's EIN is [TAXID1]. The total amount was $1,200. - Example 3 (Demonstrating what NOT to mask): Original text: The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. -Masked text: -The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. - Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. Now, please mask the following text: @@ -56,16 +50,15 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): Text to mask: {content} -Provide the placeholder list with their original values, followed by the masked text. +Provide your step-by-step reasoning, the placeholder list with their original values, and the masked text. """ CONVERT_TO_JSON_PROMPT = ( - "You are an AI assistant that converts masked text information into JSON format, " - "preserving only the masking for PII. Ensure that placeholders are strictly in the format [TYPE#], " - "without underscores or spaces." + "You are an AI assistant that converts placeholder lists into JSON format. " + "Ensure that placeholders are strictly in the format [TYPE#], without underscores or spaces." ) - CONVERT_TO_JSON_USER_PROMPT = """Convert the following masked texts and placeholder lists into a JSON format. For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original PII values) and "masked_text" (the text with PII placeholders). Do not include non-PII information such as numerical values, dates, or amounts. Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. + CONVERT_TO_JSON_USER_PROMPT = """Convert the following placeholder lists into a JSON format. For each example, the JSON should have a single key: "mapping" (a dictionary of placeholders and their original PII values). Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. Example 1: Placeholder list: @@ -73,17 +66,13 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): [ADDRESS1]: 123 Main St, New York, NY 10001 [PHONE1]: (555) 123-4567 -Masked text: -[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15. - Output: {{ "mapping": {{ "[PERSON1]": "John Smith", "[ADDRESS1]": "123 Main St, New York, NY 10001", "[PHONE1]": "(555) 123-4567" - }}, - "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. He deposited $5,000 on 2023-07-15." + }} }} Example 2: @@ -91,36 +80,15 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): [PERSON1]: Sarah Johnson [EMAIL1]: sarah.j@email.com -Masked text: -[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200. - Output: {{ "mapping": {{ "[PERSON1]": "Sarah Johnson", "[EMAIL1]": "sarah.j@email.com" - }}, - "masked_text": "[PERSON1] ordered a laptop from TechStore on 2023-05-15. Her email is [EMAIL1]. The total amount was $1,200." -}} - -Example 3: -Placeholder list: -[PERSON1]: Dr. Emily Brown -[PERSON2]: Mr. David Lee - -Masked text: -[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500. - -Output: -{{ - "mapping": {{ - "[PERSON1]": "Dr. Emily Brown", - "[PERSON2]": "Mr. David Lee" - }}, - "masked_text": "[PERSON1], born on 1985-03-22, works at Central Hospital. Her patient, [PERSON2], has an appointment on 2023-06-10 at 2:30 PM. The procedure costs $3,500." + }} }} -Now, please convert the following masked text and placeholder list into JSON format: +Now, please convert the following placeholder list into JSON format: {response_step1_content} @@ -133,9 +101,15 @@ def __init__(self, llm: LLM): async def mask_content(self, content: str) -> MaskContract: response_step1_content = await self._step1_mask_pii(content) - mask_contract = await self._step2_convert_to_json(response_step1_content) - self._validate_placeholders(mask_contract) - return mask_contract + response_step2_content = await self._step2_convert_to_json(response_step1_content) + result = self._parse_mask_contract_dict(response_step2_content.mapping, content) + return result + + def _parse_mask_contract_dict(self, mapping: dict, content: str) -> MaskContract: + masked_text = content + for placeholder, value in mapping.items(): + masked_text = masked_text.replace(value, placeholder) + return MaskContract(masked_text=masked_text, mapping=mapping) async def _step1_mask_pii(self, content: str) -> str: messages_step1 = [ @@ -146,28 +120,21 @@ async def _step1_mask_pii(self, content: str) -> str: response_step1_content = response_step1.choices[0].message.content return response_step1_content - async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContract: + async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContractDict: messages_step2 = [ {"role": "system", "content": self.CONVERT_TO_JSON_PROMPT}, { "role": "user", "content": self.CONVERT_TO_JSON_USER_PROMPT.format( - response_step1_content=response_step1_content + response_step1_content=response_step1_content, ), }, ] - response_step2 = self.llm.request(messages_step2, MaskContract) - masked_text = response_step2.masked_text - mapping = response_step2.mapping - - for placeholder, value in mapping.items(): - if value in masked_text: - masked_text = masked_text.replace(value, placeholder) - response_step2.masked_text = masked_text + response_step2 = self.llm.request(messages_step2, MaskContractDict) return response_step2 - + def _validate_placeholders(self, mask_contract: MaskContract): - placeholder_pattern = re.compile(r'^\[[A-Za-z_]+[0-9]*\]$') + placeholder_pattern = re.compile(r'^\[[A-Za-z]+[0-9]*\]$') for placeholder in mask_contract.mapping.keys(): if not placeholder_pattern.match(placeholder): raise ValueError(f"Invalid placeholder format: {placeholder}") @@ -175,4 +142,4 @@ def _validate_placeholders(self, mask_contract: MaskContract): def unmask_content(self, masked_content: str, mapping: dict) -> str: for placeholder, original in mapping.items(): masked_content = masked_content.replace(placeholder, original) - return masked_content \ No newline at end of file + return masked_content diff --git a/extract_thinker/models/MaskContract.py b/extract_thinker/models/MaskContract.py index f8a15ad..fb30b96 100644 --- a/extract_thinker/models/MaskContract.py +++ b/extract_thinker/models/MaskContract.py @@ -3,4 +3,10 @@ class MaskContract(BaseModel): masked_text: str = Field(description="The masked version of the text") + mapping: Dict[str, str] = Field(description="A dictionary mapping placeholders to original values") + + def __init__(self, masked_text: str, mapping: Dict[str, str]): + super().__init__(masked_text=masked_text, mapping=mapping) + +class MaskContractDict(BaseModel): mapping: Dict[str, str] = Field(description="A dictionary mapping placeholders to original values") \ No newline at end of file diff --git a/tests/test_process.py b/tests/test_process.py index a35890e..4a8ba2e 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -12,7 +12,7 @@ load_dotenv() cwd = os.getcwd() -async def test_mask(): +def test_mask(): # Arrange test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") @@ -35,8 +35,7 @@ async def test_mask(): ) # Act - # result = asyncio.run(process.mask_content(test_text)) - result = await process.mask_content(test_text) + result = asyncio.run(process.mask_content(test_text)) # Assert assert result.masked_text is not None @@ -79,26 +78,29 @@ async def test_mask(): for info in non_pii_info: assert info in result.masked_text, f"Non-PII {info} was unexpectedly masked" + # check if mapping length is 15 + assert len(result.mapping) == 15, "Mapping should contain 15 items" + # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) # Optionally, verify the entire unmasked content matches the original assert unmasked_content == test_text, "Unmasked content does not match the original content" -async def test_simple_use_case(): +def test_simple_use_case(): # Arrange test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") process = Process() process.load_document_loader(DocumentLoaderPyPdf()) process.load_file(test_file_path) - process.add_masking_llm("groq/llama-3.2-3b-preview") + process.add_masking_llm("groq/llama-3.2-11b-text-preview") # Arrange test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." # Act - result = await process.mask_content(test_text) + result = asyncio.run(process.mask_content(test_text)) # Assert assert result.masked_text is not None @@ -124,76 +126,5 @@ async def test_simple_use_case(): unmasked_content = process.unmask_content(result.masked_text, result.mapping) assert unmasked_content == test_text -# def test_mask_invoice(): -# # Arrange -# test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") - -# process = Process() -# process.load_document_loader(DocumentLoaderPyPdf()) -# process.load_file(test_file_path) -# process.add_masking_llm("groq/llama-3.2-11b-text-preview") - -# # Act -# test_text = ( -# "Market Financial Consulting Experts in earning trusts 450 East 78th Ave Denver, CO 12345 " -# "Phone : (123) 456 -7890 Fax: (123) 456 -7891 INVOICE INVOICE: 00012 DATE: 1/30/23 " -# "TO: Gaurav Cheema Caneiro Group 89 Pacific Ave San Francisco, CA 78910 " -# "FOR: Consultation services DESCRIPTION HOURS RATE AMOUNT " -# "Consultation services 3.0 375.00 1125.00 " -# "TOTAL 1125.00 Make all checks payable to Market Financial Consulting " -# "Total due in 15 days. Overdue accounts subject to a service charge of 1% per month. " -# "THANK YOU FOR YOUR BUSINESS!" -# ) - -# result = asyncio.run(process.mask_content(test_text)) - -# # Assert -# assert result.masked_text is not None -# assert result.mapping is not None - -# # Check if all original sensitive information is masked -# sensitive_info = [ -# "Market Financial Consulting", "450 East 78th Ave", "Denver, CO 12345", -# "(123) 456 -7890", "(123) 456 -7891", -# "Gaurav Cheema", "Caneiro Group", "89 Pacific Ave", "San Francisco, CA 78910" -# ] -# for info in sensitive_info: -# assert info not in result.masked_text, f"{info} was not masked properly" - -# # Check if placeholders are present in masked text -# placeholder_types = ["COMPANY", "ADDRESS", "PHONE", "NAME"] -# assert any(f"[{type}" in result.masked_text for type in placeholder_types), "No expected placeholders found in masked text" - -# # Check if non-sensitive information is still present -# non_sensitive_info = [ -# "INVOICE", "DATE", "FOR: Consultation services", -# "DESCRIPTION", "HOURS", "RATE", "AMOUNT", -# "3.0", "375.00", "1125.00", "TOTAL", -# "Make all checks payable to", -# "Total due in 15 days. Overdue accounts subject to a service charge of 1% per month.", -# "THANK YOU FOR YOUR BUSINESS!" -# ] -# for info in non_sensitive_info: -# assert info in result.masked_text, f"{info} was unexpectedly masked" - -# # Check mapping -# assert len(result.mapping) >= 5, "Mapping should contain at least 5 items" -# assert all(key.startswith('[') and key.endswith(']') for key in result.mapping.keys()), "Mapping keys should be enclosed in square brackets" -# assert all(isinstance(value, str) for value in result.mapping.values()), "Mapping values should be strings" - -# # Test unmasking -# unmasked_content = process.unmask_content(result.masked_text, result.mapping) -# assert "Market Financial Consulting" in unmasked_content, "Unmasking failed for 'Market Financial Consulting'" -# assert "450 East 78th Ave" in unmasked_content, "Unmasking failed for '450 East 78th Ave'" -# assert "Gaurav Cheema" in unmasked_content, "Unmasking failed for 'Gaurav Cheema'" - -# # Check if all masked content is unmasked -# for placeholder, original in result.mapping.items(): -# assert original in unmasked_content, f"Unmasking failed for {original}" -# assert placeholder not in unmasked_content, f"Placeholder {placeholder} still present in unmasked content" - -# # Check if all masked content is the same as the original content -# assert test_text == unmasked_content, "Unmasked content does not match the original content" - if __name__ == "__main__": - asyncio.run(test_mask()) \ No newline at end of file + asyncio.run(test_simple_use_case()) \ No newline at end of file From 0cf452d1a11bb4abec8b5b3bf80c0189ed83a54f Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 22 Oct 2024 22:42:38 +0200 Subject: [PATCH 08/12] deterministic hashing added --- .../deterministic_hashing_masking_strategy.py | 220 ++++++++++++++++++ .../simple_placeholder_masking_strategy.py | 220 +++++++++++++++--- extract_thinker/process.py | 4 + poetry.lock | 151 +++++++++++- pyproject.toml | 1 + tests/test_process.py | 23 +- 6 files changed, 572 insertions(+), 47 deletions(-) create mode 100644 extract_thinker/masking/deterministic_hashing_masking_strategy.py diff --git a/extract_thinker/masking/deterministic_hashing_masking_strategy.py b/extract_thinker/masking/deterministic_hashing_masking_strategy.py new file mode 100644 index 0000000..c0b4f9d --- /dev/null +++ b/extract_thinker/masking/deterministic_hashing_masking_strategy.py @@ -0,0 +1,220 @@ +import re +import hashlib +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract, MaskContractDict +from cryptography.hazmat.primitives import hashes +from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC +from cryptography.hazmat.backends import default_backend +import base64 + + +class DeterministicHashingMaskingStrategy(AbstractMaskingStrategy): + MASK_PII_PROMPT = ( + "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " + "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " + "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." + "Do not mask the key value result, they will be masked later." + "Don't return masked text, only the placeholder list." + "Values and Amounts(e.g $1000) are not PII values. The same for dates" + "Provide a step-by-step reasoning when identifying PII." + "Always return ##Placeholder list: as part of the response" + ) + + MASK_PII_USER_PROMPT = """Task: Mask personally identifiable information (PII) in the provided text, replacing PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values unless they are phone numbers or tax IDs. Return only the placeholder list with reasoning for each identified PII. + +Step 1: Reasoning & Thought Process +1. Analyze the text: + - Carefully examine each part of the text to determine if it contains PII. + - Focus on identifying common types of PII such as names, email addresses, phone numbers, tax IDs, and physical addresses. + - Ignore non-PII data such as dates, numerical values (except phone numbers and tax IDs), and any other non-sensitive information. + +2. Justify the decision: + - For each segment identified as PII, explain why it qualifies as such. + - Clearly differentiate between PII and non-PII elements. Provide reasoning for why certain elements are not PII. + +Step 2: Action +1. Mask PII: + - Replace each identified PII with an appropriate placeholder in the format [TYPE#] (e.g., [PERSON1], [ADDRESS1]). + - Do not mask any non-PII elements. + +2. Return placeholder list: + - Return a list of placeholders and their corresponding original values (but do not return the masked text). + - Ensure placeholders are formatted without underscores or spaces. + +Examples: + +Example 1: +Original text: +John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. For international calls, use +1-555-987-6543. He deposited $5,000 on 2023-07-15. + +Output: +##Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 +[TAXID1]: 123-45-6789 +[PHONE2]: +1-555-987-6543 + +Example 2: +Original text: +Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. + +Output: +##Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com +[PHONE1]: 1-800-555-1234 +[TAXID1]: 12-3456789 + +Example 3 (Demonstrating what NOT to mask): +Original text: +The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. + +Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. + +Example 4: +Original text: +John Doe transferred $5000 to Jane Smith on 2021-05-01. + +Step 1: Reasoning & Thought Process +Upon analyzing the text "John Doe transferred $5000 to Jane Smith on 2021-05-01.", we need to identify any PII present. + +1. Identifying PII Types: The common types of PII we're looking for are names (e.g., John Doe, Jane Smith), email addresses, phone numbers, tax IDs, and physical addresses. +2. Examining Text Segments: + - "John Doe" - This is a name, which is a type of PII. + - "Jane Smith" - This is another name, which is a type of PII. + - "$5000" - This is a financial transaction amount, not a phone number or tax ID, so it's not a type of PII in this context. Numerical values like this are often found in everyday text and aren't PII. + - "2021-05-01" - This is a date, which is not PII because it doesn't contain identifying information about a person. + +Step 2: Action +Based on the identified PII types and segments, we'll create placeholders for each PII found. + +1. Masking PII: We'll replace each identified PII with an appropriate placeholder in the format [TYPE#]. +2. Returning Placeholder List: We'll return a list of placeholders and their corresponding original values. + +Output: +##Placeholder list: +[PERSON1]: John Doe +[PERSON2]: Jane Smith + +Text to mask: +{content} + +Provide your step-by-step reasoning, and then return the placeholder list. +""" + + CONVERT_TO_JSON_PROMPT = ( + "You are an AI assistant that converts placeholder lists into JSON format. " + "Ensure that placeholders are strictly in the format [TYPE#], without underscores or spaces." + ) + + CONVERT_TO_JSON_USER_PROMPT = """Convert the following placeholder lists into a JSON format. For each example, the JSON should have a single key: "mapping" (a dictionary of placeholders and their original PII values). Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. + +Example 1: +Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 + +Output: +{{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }} +}} + +Example 2: +Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com + +Output: +{{ + "mapping": {{ + "[PERSON1]": "Sarah Johnson", + "[EMAIL1]": "sarah.j@email.com" + }} +}} + +Now, please convert the following placeholder list into JSON format: + +{response_step1_content} + +##JSON +""" + + def __init__(self, llm: LLM): + super().__init__(llm) + self.placeholder_counter = {} + + async def mask_content(self, content: str) -> MaskContract: + response_step1_content = await self._step1_mask_pii(content) + response_step2_content = await self._step2_convert_to_json(response_step1_content) + result = self._parse_mask_contract_dict(response_step2_content.mapping, content) + return result + + def _parse_mask_contract_dict(self, mapping: dict, content: str) -> MaskContract: + masked_text = content + for placeholder, value in mapping.items(): + hash_value = self._deterministic_hash(value) + masked_text = masked_text.replace(value, f"{hash_value}") + return MaskContract(masked_text=masked_text, mapping=mapping) + + async def _step1_mask_pii(self, content: str) -> str: + messages_step1 = [ + {"role": "system", "content": self.MASK_PII_PROMPT}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, + ] + response_step1 = self.llm.request(messages_step1) + response_step1_content = response_step1.choices[0].message.content + + # Split the response into reasoning and the placeholder list + split_result = response_step1_content.split("##Placeholder list:") + if len(split_result) == 2: + reasoning_part = split_result[0].strip() + placeholder_list = split_result[1].strip() + else: + raise ValueError("Unexpected response format: 'Placeholder List' section not found.") + + # Return only the placeholder list + return placeholder_list + + async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContractDict: + messages_step2 = [ + {"role": "system", "content": self.CONVERT_TO_JSON_PROMPT}, + { + "role": "user", + "content": self.CONVERT_TO_JSON_USER_PROMPT.format( + response_step1_content=response_step1_content, + ), + }, + ] + response_step2 = self.llm.request(messages_step2, MaskContractDict) + return response_step2 + + def _validate_placeholders(self, mask_contract: MaskContract): + placeholder_pattern = re.compile(r'^\[[A-Za-z]+[0-9]*\]$') + for placeholder in mask_contract.mapping.keys(): + if not placeholder_pattern.match(placeholder): + raise ValueError(f"Invalid placeholder format: {placeholder}") + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content + + def _deterministic_hash(self, value: str) -> str: + # Generate a deterministic hash using PBKDF2HMAC with SHA256 + salt = b'some_constant_salt' + kdf = PBKDF2HMAC( + algorithm=hashes.SHA256(), + length=32, + salt=salt, + iterations=100000, + backend=default_backend() + ) + hashed = kdf.derive(value.encode()) + return base64.urlsafe_b64encode(hashed).decode('utf-8') diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 1707ac1..156a584 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -1,8 +1,8 @@ import re from extract_thinker.llm import LLM from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy -from extract_thinker.models.MaskContract import MaskContract, MaskContractDict - +from extract_thinker.models.MaskContract import MaskContract +import asyncio class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): MASK_PII_PROMPT = ( @@ -10,19 +10,43 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." "Do not mask the key value result, they will be masked later." - "Dont return masked text, only the placeholder list." + "Don't return masked text, only the placeholder list." + "Values and Amounts(e.g $1000) are not PII values. The same for dates" "Provide a step-by-step reasoning when identifying PII." + "Always return ##Placeholder list: as part of the response" ) - MASK_PII_USER_PROMPT = """Please mask only Personally Identifiable Information (PII) in the following text. Replace PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], [PHONE1], [TAXID1], etc. Do not use underscores or spaces in placeholders. Mask names, addresses, email addresses, phone numbers (including international formats), tax IDs, and other PII. Do not mask numerical values (except phone numbers and tax IDs), dates, amounts, or other non-PII information. Return the masked text and a list of placeholders with their original values. + MASK_PII_USER_PROMPT = """Task: Mask personally identifiable information (PII) in the provided text, replacing PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values unless they are phone numbers or tax IDs. Return only the placeholder list with reasoning for each identified PII. + +Step 1: Reasoning & Thought Process +1. Analyze the text: + - Carefully examine each part of the text to determine if it contains PII. + - Focus on identifying common types of PII such as names, email addresses, phone numbers, tax IDs, and physical addresses. + - Ignore non-PII data such as dates, numerical values (except phone numbers and tax IDs), and any other non-sensitive information. + +2. Justify the decision: + - For each segment identified as PII, explain why it qualifies as such. + - Clearly differentiate between PII and non-PII elements. Provide reasoning for why certain elements are not PII. + +Step 2: Action +1. Mask PII: + - Replace each identified PII with an appropriate placeholder in the format [TYPE#] (e.g., [PERSON1], [ADDRESS1]). + - Do not mask any non-PII elements. + +2. Return placeholder list: + - Return a list of placeholders and their corresponding original values (but do not return the masked text). + - Ensure placeholders are formatted without underscores or spaces. + +**Important: Always include '##PLACEHOLDER LIST:' before the placeholder list.** -Here are some examples of correct masking: +Examples: Example 1: Original text: John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. For international calls, use +1-555-987-6543. He deposited $5,000 on 2023-07-15. -Placeholder list: +Output: +##PLACEHOLDER LIST: [PERSON1]: John Smith [ADDRESS1]: 123 Main St, New York, NY 10001 [PHONE1]: (555) 123-4567 @@ -33,7 +57,8 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): Original text: Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. -Placeholder list: +Output: +##PLACEHOLDER LIST: [PERSON1]: Sarah Johnson [EMAIL1]: sarah.j@email.com [PHONE1]: 1-800-555-1234 @@ -45,12 +70,35 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. -Now, please mask the following text: +Example 4: +Original text: +John Doe transferred $5000 to Jane Smith on 2021-05-01. + +Step 1: Reasoning & Thought Process +Upon analyzing the text "John Doe transferred $5000 to Jane Smith on 2021-05-01.", we need to identify any PII present. + +1. Identifying PII Types: The common types of PII we're looking for are names (e.g., John Doe, Jane Smith), email addresses, phone numbers, tax IDs, and physical addresses. +2. Examining Text Segments: + - "John Doe" - This is a name, which is a type of PII. + - "Jane Smith" - This is another name, which is a type of PII. + - "$5000" - This is a financial transaction amount, not a phone number or tax ID, so it's not a type of PII in this context. Numerical values like this are often found in everyday text and aren't PII. + - "2021-05-01" - This is a date, which is not PII because it doesn't contain identifying information about a person. + +Step 2: Action +Based on the identified PII types and segments, we'll create placeholders for each PII found. + +1. Masking PII: We'll replace each identified PII with an appropriate placeholder in the format [TYPE#]. +2. Returning Placeholder List: We'll return a list of placeholders and their corresponding original values. + +Output: +##PLACEHOLDER LIST: +[PERSON1]: John Doe +[PERSON2]: Jane Smith Text to mask: {content} -Provide your step-by-step reasoning, the placeholder list with their original values, and the masked text. +Provide your step-by-step reasoning, and then return the placeholder list. **Remember to include '##PLACEHOLDER LIST:' before the list.** """ CONVERT_TO_JSON_PROMPT = ( @@ -93,6 +141,50 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): {response_step1_content} ##JSON +""" + + RECONCILE_PROMPT = ( + "You are an AI assistant tasked with reconciling multiple placeholder lists generated from the same text. " + "Your goal is to produce a final, consolidated placeholder list that accurately identifies all PII in the text. " + "Consider all the provided placeholder lists and the original text, and determine the most accurate and comprehensive mapping. " + "Ensure that placeholders are in the format [TYPE#], without underscores or spaces." + ) + + RECONCILE_USER_PROMPT = """Given the original text and multiple placeholder lists generated by different analyses, reconcile these lists to produce a final, consolidated placeholder list. Your final list should include all PII elements that are consistently identified across the analyses, as well as any additional PII that you determine should be included upon reviewing the original text. + +Original Text: +{content} + +Placeholder Lists: +{placeholder_lists} + +Instructions: +- Review each placeholder list carefully. +- Compare the lists and identify common PII elements. +- Re-examine the original text to ensure no PII is missed. +- Consolidate the placeholders, ensuring unique numbering (e.g., [PERSON1], [PERSON2]). +- Return the final placeholder list in the same format, preceded by '##FINAL PLACEHOLDER LIST:'. + +**Remember to include '##FINAL PLACEHOLDER LIST:' before your consolidated list.** + +Example: + +If given: +Placeholder Lists: +- List 1: + [PERSON1]: John Doe + [EMAIL1]: john@example.com +- List 2: + [PERSON1]: John Doe + [PHONE1]: (555) 123-4567 + +Your output should be: +##FINAL PLACEHOLDER LIST: +[PERSON1]: John Doe +[EMAIL1]: john@example.com +[PHONE1]: (555) 123-4567 + +Now, please provide the final placeholder list. """ def __init__(self, llm: LLM): @@ -100,46 +192,104 @@ def __init__(self, llm: LLM): self.placeholder_counter = {} async def mask_content(self, content: str) -> MaskContract: - response_step1_content = await self._step1_mask_pii(content) - response_step2_content = await self._step2_convert_to_json(response_step1_content) - result = self._parse_mask_contract_dict(response_step2_content.mapping, content) + placeholder_lists = await self._step1_mask_pii(content) + final_mapping = await self._step2_reconcile_mappings(content, placeholder_lists) + result = self._parse_mask_contract_dict(final_mapping, content) return result - + def _parse_mask_contract_dict(self, mapping: dict, content: str) -> MaskContract: masked_text = content for placeholder, value in mapping.items(): masked_text = masked_text.replace(value, placeholder) return MaskContract(masked_text=masked_text, mapping=mapping) - async def _step1_mask_pii(self, content: str) -> str: - messages_step1 = [ - {"role": "system", "content": self.MASK_PII_PROMPT}, - {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, - ] - response_step1 = self.llm.request(messages_step1) - response_step1_content = response_step1.choices[0].message.content - return response_step1_content + async def _step1_mask_pii(self, content: str) -> list: + MAX_RETRIES = 2 # Maximum number of retries per run - async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContractDict: - messages_step2 = [ - {"role": "system", "content": self.CONVERT_TO_JSON_PROMPT}, + async def single_run(): + attempt = 0 + while attempt <= MAX_RETRIES: + messages_step1 = [ + {"role": "system", "content": self.MASK_PII_PROMPT}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, + ] + response_step1 = self.llm.request(messages_step1) + response_step1_content = response_step1.choices[0].message.content + + if "##PLACEHOLDER LIST:" not in response_step1_content: + # Retry with an explicit reminder + attempt += 1 + reminder_message = ( + "The previous response did not include '##PLACEHOLDER LIST:'. " + "Please make sure to include '##PLACEHOLDER LIST:' followed by the placeholder list." + ) + messages_step1.append({"role": "assistant", "content": response_step1_content}) + messages_step1.append({"role": "user", "content": reminder_message}) + continue # Retry the request + else: + # Extract the placeholder list + split_result = response_step1_content.split("##PLACEHOLDER LIST:") + placeholder_list = split_result[1].strip() + return placeholder_list + + # If all retries fail, raise an error + raise ValueError("Unable to obtain a valid '##PLACEHOLDER LIST:' after multiple attempts.") + + # Run the single_run function multiple times (e.g., 3 times) in parallel + runs = [single_run() for _ in range(3)] + results = await asyncio.gather(*runs) + + # Now, results is a list of placeholder list strings from each run + return results + + async def _step2_reconcile_mappings(self, content: str, placeholder_lists: list) -> dict: + # Prepare the placeholder lists string for the prompt + placeholder_lists_str = "" + for idx, plist in enumerate(placeholder_lists, 1): + placeholder_lists_str += f"- List {idx}:\n{plist}\n" + + messages_reconcile = [ + {"role": "system", "content": self.RECONCILE_PROMPT}, { "role": "user", - "content": self.CONVERT_TO_JSON_USER_PROMPT.format( - response_step1_content=response_step1_content, + "content": self.RECONCILE_USER_PROMPT.format( + content=content, + placeholder_lists=placeholder_lists_str, ), }, ] - response_step2 = self.llm.request(messages_step2, MaskContractDict) - return response_step2 - - def _validate_placeholders(self, mask_contract: MaskContract): - placeholder_pattern = re.compile(r'^\[[A-Za-z]+[0-9]*\]$') - for placeholder in mask_contract.mapping.keys(): - if not placeholder_pattern.match(placeholder): - raise ValueError(f"Invalid placeholder format: {placeholder}") + + # Send the reconciliation request to the LLM + response_reconcile = self.llm.request(messages_reconcile) + response_reconcile_content = response_reconcile.choices[0].message.content + + if "##FINAL PLACEHOLDER LIST:" not in response_reconcile_content: + raise ValueError("The final placeholder list was not found in the LLM's response.") + + # Extract the final placeholder list + split_result = response_reconcile_content.split("##FINAL PLACEHOLDER LIST:") + final_placeholder_list = split_result[1].strip() + + # Parse the final placeholder list into a mapping + final_mapping = self._parse_placeholder_list(final_placeholder_list) + return final_mapping + + def _parse_placeholder_list(self, placeholder_list_str: str) -> dict: + mapping = {} + lines = placeholder_list_str.strip().split('\n') + for line in lines: + if line.strip() == '': + continue + # Expected format: [PLACEHOLDER]: original_value + parts = line.split(':', 1) + if len(parts) != 2: + continue # or raise an error + placeholder = parts[0].strip() + original_value = parts[1].strip() + mapping[placeholder] = original_value + return mapping def unmask_content(self, masked_content: str, mapping: dict) -> str: for placeholder, original in mapping.items(): masked_content = masked_content.replace(placeholder, original) - return masked_content + return masked_content \ No newline at end of file diff --git a/extract_thinker/process.py b/extract_thinker/process.py index 6875fad..5de0ca7 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -1,6 +1,7 @@ import asyncio from typing import IO, Any, Dict, List, Optional, Union from extract_thinker.extractor import Extractor +from extract_thinker.masking.deterministic_hashing_masking_strategy import DeterministicHashingMaskingStrategy from extract_thinker.models.classification import Classification from extract_thinker.document_loader.document_loader import DocumentLoader from extract_thinker.models.classification_tree import ClassificationTree @@ -28,6 +29,7 @@ class ClassificationStrategy(Enum): class MaskingStrategy(Enum): SIMPLE_PLACEHOLDER = "simple_placeholder" MOCKED_DATA = "mocked_data" + DETERMINISTIC_HASHING = "deterministic_hashing" class Process: def __init__(self): @@ -53,6 +55,8 @@ def add_masking_llm(self, model: Optional[str] = None, strategy: Optional[MaskCo self.masking_strategy = SimplePlaceholderMaskingStrategy(self.llm) elif strategy == MaskingStrategy.MOCKED_DATA: self.masking_strategy = MockedDataMaskingStrategy(self.llm) + elif strategy == MaskingStrategy.DETERMINISTIC_HASHING: + self.masking_strategy = DeterministicHashingMaskingStrategy(self.llm) async def mask_content(self, content: str) -> MaskContract: if self.masking_strategy is None: diff --git a/poetry.lock b/poetry.lock index ab877a3..422a23e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -238,8 +238,8 @@ pathspec = ">=0.9.0,<1" platformdirs = ">=2" tomli = ">=0.2.6,<2.0.0" typing-extensions = [ - {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, {version = ">=3.10.0.0,<3.10.0.1 || >3.10.0.1", markers = "python_version >= \"3.10\""}, + {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, ] [package.extras] @@ -283,8 +283,8 @@ files = [ jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ - {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, ] [package.extras] @@ -312,6 +312,85 @@ files = [ {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, ] +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + [[package]] name = "charset-normalizer" version = "3.3.2" @@ -436,6 +515,55 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "cryptography" +version = "43.0.3" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +optional = false +python-versions = ">=3.7" +files = [ + {file = "cryptography-43.0.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf7a1932ac4176486eab36a19ed4c0492da5d97123f1406cf15e41b05e787d2e"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63efa177ff54aec6e1c0aefaa1a241232dcd37413835a9b674b6e3f0ae2bfd3e"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e1ce50266f4f70bf41a2c6dc4358afadae90e2a1e5342d3c08883df1675374f"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:443c4a81bb10daed9a8f334365fe52542771f25aedaf889fd323a853ce7377d6"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:74f57f24754fe349223792466a709f8e0c093205ff0dca557af51072ff47ab18"}, + {file = "cryptography-43.0.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9762ea51a8fc2a88b70cf2995e5675b38d93bf36bd67d91721c309df184f49bd"}, + {file = "cryptography-43.0.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:81ef806b1fef6b06dcebad789f988d3b37ccaee225695cf3e07648eee0fc6b73"}, + {file = "cryptography-43.0.3-cp37-abi3-win32.whl", hash = "sha256:cbeb489927bd7af4aa98d4b261af9a5bc025bd87f0e3547e11584be9e9427be2"}, + {file = "cryptography-43.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:f46304d6f0c6ab8e52770addfa2fc41e6629495548862279641972b6215451cd"}, + {file = "cryptography-43.0.3-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:8ac43ae87929a5982f5948ceda07001ee5e83227fd69cf55b109144938d96984"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:846da004a5804145a5f441b8530b4bf35afbf7da70f82409f151695b127213d5"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f996e7268af62598f2fc1204afa98a3b5712313a55c4c9d434aef49cadc91d4"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f7b178f11ed3664fd0e995a47ed2b5ff0a12d893e41dd0494f406d1cf555cab7"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c2e6fc39c4ab499049df3bdf567f768a723a5e8464816e8f009f121a5a9f4405"}, + {file = "cryptography-43.0.3-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e1be4655c7ef6e1bbe6b5d0403526601323420bcf414598955968c9ef3eb7d16"}, + {file = "cryptography-43.0.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:df6b6c6d742395dd77a23ea3728ab62f98379eff8fb61be2744d4679ab678f73"}, + {file = "cryptography-43.0.3-cp39-abi3-win32.whl", hash = "sha256:d56e96520b1020449bbace2b78b603442e7e378a9b3bd68de65c782db1507995"}, + {file = "cryptography-43.0.3-cp39-abi3-win_amd64.whl", hash = "sha256:0c580952eef9bf68c4747774cde7ec1d85a6e61de97281f2dba83c7d2c806362"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d03b5621a135bffecad2c73e9f4deb1a0f977b9a8ffe6f8e002bf6c9d07b918c"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a2a431ee15799d6db9fe80c82b055bae5a752bef645bba795e8e52687c69efe3"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:281c945d0e28c92ca5e5930664c1cefd85efe80e5c0d2bc58dd63383fda29f83"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f18c716be16bc1fea8e95def49edf46b82fccaa88587a45f8dc0ff6ab5d8e0a7"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4a02ded6cd4f0a5562a8887df8b3bd14e822a90f97ac5e544c162899bc467664"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:53a583b6637ab4c4e3591a15bc9db855b8d9dee9a669b550f311480acab6eb08"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1ec0bcf7e17c0c5669d881b1cd38c4972fade441b27bda1051665faaa89bdcaa"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ce6fae5bdad59577b44e4dfed356944fbf1d925269114c28be377692643b4ff"}, + {file = "cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805"}, +] + +[package.dependencies] +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] +nox = ["nox"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] +sdist = ["build"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi", "cryptography-vectors (==43.0.3)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] + [[package]] name = "distro" version = "1.9.0" @@ -655,12 +783,12 @@ files = [ google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, ] proto-plus = ">=1.22.3,<2.0.0dev" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" @@ -1772,6 +1900,17 @@ files = [ {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, ] +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + [[package]] name = "pydantic" version = "2.7.4" @@ -2712,4 +2851,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "80be233c997072eb67a8bba585fe04e4d97c968504bf7e165e58fd0a62f1a224" +content-hash = "5d0075a7afa75b4fbd500e9b3e9a28598afec92acd1918a624f03a5b24f49c89" diff --git a/pyproject.toml b/pyproject.toml index 2331ccd..559b33f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ pypdf2 = "^3.0.1" azure-ai-formrecognizer = "^3.3.3" google-cloud-documentai = "^2.29.1" boto3 = "^1.34.161" +cryptography = "^43.0.3" [tool.poetry.dev-dependencies] flake8 = "^3.9.2" diff --git a/tests/test_process.py b/tests/test_process.py index 4a8ba2e..5f86cf0 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -1,11 +1,8 @@ import os -import pytest -from dotenv import load_dotenv - import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from extract_thinker.process import Process +from dotenv import load_dotenv +from extract_thinker.process import MaskingStrategy, Process from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf import asyncio @@ -126,5 +123,19 @@ def test_simple_use_case(): unmasked_content = process.unmask_content(result.masked_text, result.mapping) assert unmasked_content == test_text +def test_deterministic_hashing(): + # Arrange + process = Process() + process.add_masking_llm("groq/llama-3.2-11b-text-preview", MaskingStrategy.DETERMINISTIC_HASHING) + + test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." + + # Act + result = asyncio.run(process.mask_content(test_text)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + if __name__ == "__main__": - asyncio.run(test_simple_use_case()) \ No newline at end of file + test_mask() \ No newline at end of file From e1d6936b2cadcd4496d0f4e8c28216975706353c Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Wed, 23 Oct 2024 12:20:41 +0200 Subject: [PATCH 09/12] sanity commit --- .../simple_placeholder_masking_strategy.py | 336 +++++++----------- extract_thinker/process.py | 2 +- tests/test_process.py | 39 +- 3 files changed, 147 insertions(+), 230 deletions(-) diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py index 156a584..eb12d76 100644 --- a/extract_thinker/masking/simple_placeholder_masking_strategy.py +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -9,40 +9,12 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." - "Do not mask the key value result, they will be masked later." - "Don't return masked text, only the placeholder list." - "Values and Amounts(e.g $1000) are not PII values. The same for dates" - "Provide a step-by-step reasoning when identifying PII." - "Always return ##Placeholder list: as part of the response" ) - MASK_PII_USER_PROMPT = """Task: Mask personally identifiable information (PII) in the provided text, replacing PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values unless they are phone numbers or tax IDs. Return only the placeholder list with reasoning for each identified PII. - -Step 1: Reasoning & Thought Process -1. Analyze the text: - - Carefully examine each part of the text to determine if it contains PII. - - Focus on identifying common types of PII such as names, email addresses, phone numbers, tax IDs, and physical addresses. - - Ignore non-PII data such as dates, numerical values (except phone numbers and tax IDs), and any other non-sensitive information. - -2. Justify the decision: - - For each segment identified as PII, explain why it qualifies as such. - - Clearly differentiate between PII and non-PII elements. Provide reasoning for why certain elements are not PII. - -Step 2: Action -1. Mask PII: - - Replace each identified PII with an appropriate placeholder in the format [TYPE#] (e.g., [PERSON1], [ADDRESS1]). - - Do not mask any non-PII elements. - -2. Return placeholder list: - - Return a list of placeholders and their corresponding original values (but do not return the masked text). - - Ensure placeholders are formatted without underscores or spaces. - -**Important: Always include '##PLACEHOLDER LIST:' before the placeholder list.** - -Examples: + MASK_PII_USER_PROMPT = """You are an AI assistant that masks only Personally Identifiable Information (PII) in text. Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values or non-PII data. Example 1: -Original text: +Input: John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. For international calls, use +1-555-987-6543. He deposited $5,000 on 2023-07-15. Output: @@ -53,165 +25,57 @@ class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): [TAXID1]: 123-45-6789 [PHONE2]: +1-555-987-6543 -Example 2: -Original text: -Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. - -Output: -##PLACEHOLDER LIST: -[PERSON1]: Sarah Johnson -[EMAIL1]: sarah.j@email.com -[PHONE1]: 1-800-555-1234 -[TAXID1]: 12-3456789 - -Example 3 (Demonstrating what NOT to mask): -Original text: -The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. - -Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. - -Example 4: -Original text: -John Doe transferred $5000 to Jane Smith on 2021-05-01. - -Step 1: Reasoning & Thought Process -Upon analyzing the text "John Doe transferred $5000 to Jane Smith on 2021-05-01.", we need to identify any PII present. - -1. Identifying PII Types: The common types of PII we're looking for are names (e.g., John Doe, Jane Smith), email addresses, phone numbers, tax IDs, and physical addresses. -2. Examining Text Segments: - - "John Doe" - This is a name, which is a type of PII. - - "Jane Smith" - This is another name, which is a type of PII. - - "$5000" - This is a financial transaction amount, not a phone number or tax ID, so it's not a type of PII in this context. Numerical values like this are often found in everyday text and aren't PII. - - "2021-05-01" - This is a date, which is not PII because it doesn't contain identifying information about a person. - -Step 2: Action -Based on the identified PII types and segments, we'll create placeholders for each PII found. - -1. Masking PII: We'll replace each identified PII with an appropriate placeholder in the format [TYPE#]. -2. Returning Placeholder List: We'll return a list of placeholders and their corresponding original values. - -Output: -##PLACEHOLDER LIST: -[PERSON1]: John Doe -[PERSON2]: Jane Smith - -Text to mask: +Input: {content} -Provide your step-by-step reasoning, and then return the placeholder list. **Remember to include '##PLACEHOLDER LIST:' before the list.** -""" - - CONVERT_TO_JSON_PROMPT = ( - "You are an AI assistant that converts placeholder lists into JSON format. " - "Ensure that placeholders are strictly in the format [TYPE#], without underscores or spaces." - ) - - CONVERT_TO_JSON_USER_PROMPT = """Convert the following placeholder lists into a JSON format. For each example, the JSON should have a single key: "mapping" (a dictionary of placeholders and their original PII values). Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. - -Example 1: -Placeholder list: -[PERSON1]: John Smith -[ADDRESS1]: 123 Main St, New York, NY 10001 -[PHONE1]: (555) 123-4567 - -Output: -{{ - "mapping": {{ - "[PERSON1]": "John Smith", - "[ADDRESS1]": "123 Main St, New York, NY 10001", - "[PHONE1]": "(555) 123-4567" - }} -}} - -Example 2: -Placeholder list: -[PERSON1]: Sarah Johnson -[EMAIL1]: sarah.j@email.com - Output: -{{ - "mapping": {{ - "[PERSON1]": "Sarah Johnson", - "[EMAIL1]": "sarah.j@email.com" - }} -}} - -Now, please convert the following placeholder list into JSON format: - -{response_step1_content} - -##JSON """ - RECONCILE_PROMPT = ( - "You are an AI assistant tasked with reconciling multiple placeholder lists generated from the same text. " - "Your goal is to produce a final, consolidated placeholder list that accurately identifies all PII in the text. " - "Consider all the provided placeholder lists and the original text, and determine the most accurate and comprehensive mapping. " - "Ensure that placeholders are in the format [TYPE#], without underscores or spaces." - ) - - RECONCILE_USER_PROMPT = """Given the original text and multiple placeholder lists generated by different analyses, reconcile these lists to produce a final, consolidated placeholder list. Your final list should include all PII elements that are consistently identified across the analyses, as well as any additional PII that you determine should be included upon reviewing the original text. + def __init__(self, llm: LLM): + super().__init__(llm) + self.global_mapping = {} # Final mapping of placeholders to PII values + self.pii_to_placeholder = {} # Mapping of PII values to placeholders -Original Text: -{content} + async def mask_content(self, content: str) -> MaskContract: + paragraphs = self._split_into_paragraphs(content) + masked_paragraphs = [] + all_mappings = [] -Placeholder Lists: -{placeholder_lists} - -Instructions: -- Review each placeholder list carefully. -- Compare the lists and identify common PII elements. -- Re-examine the original text to ensure no PII is missed. -- Consolidate the placeholders, ensuring unique numbering (e.g., [PERSON1], [PERSON2]). -- Return the final placeholder list in the same format, preceded by '##FINAL PLACEHOLDER LIST:'. - -**Remember to include '##FINAL PLACEHOLDER LIST:' before your consolidated list.** - -Example: - -If given: -Placeholder Lists: -- List 1: - [PERSON1]: John Doe - [EMAIL1]: john@example.com -- List 2: - [PERSON1]: John Doe - [PHONE1]: (555) 123-4567 - -Your output should be: -##FINAL PLACEHOLDER LIST: -[PERSON1]: John Doe -[EMAIL1]: john@example.com -[PHONE1]: (555) 123-4567 + for paragraph in paragraphs: + placeholder_list = await self._process_paragraph(paragraph) + mapping = self._parse_placeholder_list(placeholder_list) + all_mappings.append(mapping) + # Mask the paragraph using the mapping + masked_paragraph = self._apply_masking(paragraph, mapping) + masked_paragraphs.append(masked_paragraph) -Now, please provide the final placeholder list. -""" + # After processing all paragraphs, reconcile the mappings + self._reconcile_mappings(all_mappings) - def __init__(self, llm: LLM): - super().__init__(llm) - self.placeholder_counter = {} + # Combine masked paragraphs back into the final masked text + masked_text = '\n\n'.join(masked_paragraphs) - async def mask_content(self, content: str) -> MaskContract: - placeholder_lists = await self._step1_mask_pii(content) - final_mapping = await self._step2_reconcile_mappings(content, placeholder_lists) - result = self._parse_mask_contract_dict(final_mapping, content) + result = MaskContract(masked_text=masked_text, mapping=self.global_mapping) return result - def _parse_mask_contract_dict(self, mapping: dict, content: str) -> MaskContract: - masked_text = content - for placeholder, value in mapping.items(): - masked_text = masked_text.replace(value, placeholder) - return MaskContract(masked_text=masked_text, mapping=mapping) - - async def _step1_mask_pii(self, content: str) -> list: - MAX_RETRIES = 2 # Maximum number of retries per run + def _split_into_paragraphs(self, text: str) -> list: + # Split text into paragraphs based on various newline patterns + paragraphs = re.split(r'\n{2,}|\r\n{2,}|\r{2,}', text.strip()) + # Further split paragraphs if they contain single newlines + result = [] + for paragraph in paragraphs: + sub_paragraphs = paragraph.split('\n') + result.extend(sub_para.strip() for sub_para in sub_paragraphs if sub_para.strip()) + return result + async def _process_paragraph(self, paragraph: str) -> str: async def single_run(): + MAX_SINGLE_RUN_RETRIES = 2 # Maximum number of retries per single run attempt = 0 - while attempt <= MAX_RETRIES: + while attempt <= MAX_SINGLE_RUN_RETRIES: messages_step1 = [ {"role": "system", "content": self.MASK_PII_PROMPT}, - {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=paragraph)}, ] response_step1 = self.llm.request(messages_step1) response_step1_content = response_step1.choices[0].message.content @@ -235,44 +99,67 @@ async def single_run(): # If all retries fail, raise an error raise ValueError("Unable to obtain a valid '##PLACEHOLDER LIST:' after multiple attempts.") - # Run the single_run function multiple times (e.g., 3 times) in parallel - runs = [single_run() for _ in range(3)] - results = await asyncio.gather(*runs) - - # Now, results is a list of placeholder list strings from each run - return results - - async def _step2_reconcile_mappings(self, content: str, placeholder_lists: list) -> dict: - # Prepare the placeholder lists string for the prompt - placeholder_lists_str = "" - for idx, plist in enumerate(placeholder_lists, 1): - placeholder_lists_str += f"- List {idx}:\n{plist}\n" - - messages_reconcile = [ - {"role": "system", "content": self.RECONCILE_PROMPT}, - { - "role": "user", - "content": self.RECONCILE_USER_PROMPT.format( - content=content, - placeholder_lists=placeholder_lists_str, - ), - }, + MAX_RETRIES = 10 # Maximum number of retries for inconsistent results + retry_count = 0 + + while retry_count < MAX_RETRIES: + # Run two parallel requests + initial_runs = [single_run() for _ in range(2)] + initial_results = await asyncio.gather(*initial_runs) + + # Compare the initial two results + if initial_results[0] != initial_results[1]: + retry_count += 1 + continue # Retry due to inconsistency + + # Reconcile all mappings + all_results = initial_results + final_placeholder_list = self._reconcile_placeholder_lists(all_results, paragraph) + return final_placeholder_list + + # If all retries fail, raise an error + raise ValueError("Unable to obtain consistent placeholder lists after maximum retries.") + + + def _reconcile_placeholder_lists(self, placeholder_lists: list, paragraph: str) -> str: + # Parse each placeholder list into a mapping + mappings = [self._parse_placeholder_list(plist) for plist in placeholder_lists] + + # Collect counts of original values and their PII types + original_value_counts = {} + for mapping in mappings: + for placeholder, original_value in mapping.items(): + # Extract PII type from placeholder, e.g., [PERSON1] -> PERSON + m = re.match(r'\[([A-Za-z]+)[0-9]*\]', placeholder) + if m: + pii_type = m.group(1) + key = (original_value, pii_type) + if key not in original_value_counts: + original_value_counts[key] = 0 + original_value_counts[key] += 1 + + # Keep original values that appear in all lists + required_count = len(placeholder_lists) + final_items = [ + (original_value, pii_type) + for (original_value, pii_type), count in original_value_counts.items() + if count == required_count ] - # Send the reconciliation request to the LLM - response_reconcile = self.llm.request(messages_reconcile) - response_reconcile_content = response_reconcile.choices[0].message.content + # If no items appear consistently across all lists, accept all items + if not final_items: + for (original_value, pii_type), count in original_value_counts.items(): + final_items.append((original_value, pii_type)) - if "##FINAL PLACEHOLDER LIST:" not in response_reconcile_content: - raise ValueError("The final placeholder list was not found in the LLM's response.") + # Create a mapping for this paragraph without assigning placeholders yet + paragraph_mapping = {} + for original_value, pii_type in final_items: + paragraph_mapping[original_value] = pii_type - # Extract the final placeholder list - split_result = response_reconcile_content.split("##FINAL PLACEHOLDER LIST:") - final_placeholder_list = split_result[1].strip() + # Return the paragraph mapping as a placeholder list string + placeholder_list = '\n'.join([f'{pii_type}: {original_value}' for original_value, pii_type in paragraph_mapping.items()]) - # Parse the final placeholder list into a mapping - final_mapping = self._parse_placeholder_list(final_placeholder_list) - return final_mapping + return placeholder_list def _parse_placeholder_list(self, placeholder_list_str: str) -> dict: mapping = {} @@ -289,7 +176,44 @@ def _parse_placeholder_list(self, placeholder_list_str: str) -> dict: mapping[placeholder] = original_value return mapping + def _reconcile_mappings(self, all_mappings: list): + # Collect all PII values and their types + pii_items = {} + for mapping in all_mappings: + for placeholder, original_value in mapping.items(): + m = re.match(r'\[([A-Za-z]+)[0-9]*\]', placeholder) + if m: + pii_type = m.group(1) + if original_value not in pii_items: + pii_items[original_value] = pii_type + else: + # If the same PII value has different types, decide how to handle it + # For simplicity, we'll keep the first type encountered + pass + + # Assign placeholders to PII values + placeholder_counters = {} + for original_value, pii_type in pii_items.items(): + if pii_type not in placeholder_counters: + placeholder_counters[pii_type] = 1 + else: + placeholder_counters[pii_type] += 1 + placeholder = f'[{pii_type}{placeholder_counters[pii_type]}]' + self.global_mapping[placeholder] = original_value + self.pii_to_placeholder[original_value] = placeholder + + def _apply_masking(self, text: str, mapping: dict) -> str: + masked_text = text + # Use the global mapping to ensure consistency across paragraphs + for original_value, pii_type in mapping.items(): + placeholder = self.pii_to_placeholder.get(original_value) + if placeholder: + masked_text = masked_text.replace(original_value, placeholder) + return masked_text + def unmask_content(self, masked_content: str, mapping: dict) -> str: - for placeholder, original in mapping.items(): + # Sort placeholders by length to avoid partial replacements + sorted_mapping = dict(sorted(mapping.items(), key=lambda x: -len(x[0]))) + for placeholder, original in sorted_mapping.items(): masked_content = masked_content.replace(placeholder, original) return masked_content \ No newline at end of file diff --git a/extract_thinker/process.py b/extract_thinker/process.py index 5de0ca7..20f19f8 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -45,7 +45,7 @@ def __init__(self): self.masking_strategy: Optional[AbstractMaskingStrategy] = None self.llm: Optional[LLM] = None - def add_masking_llm(self, model: Optional[str] = None, strategy: Optional[MaskContract] = MaskingStrategy.SIMPLE_PLACEHOLDER): + def add_masking_llm(self, model: Optional[str | LLM] = None, strategy: Optional[MaskContract] = MaskingStrategy.SIMPLE_PLACEHOLDER): if isinstance(model, LLM): self.llm = model elif model is not None: diff --git a/tests/test_process.py b/tests/test_process.py index 5f86cf0..958dddf 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -1,8 +1,9 @@ import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from extract_thinker.llm import LLM from dotenv import load_dotenv -from extract_thinker.process import MaskingStrategy, Process +from extract_thinker.process import Process from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf import asyncio @@ -16,20 +17,12 @@ def test_mask(): process = Process() process.load_document_loader(DocumentLoaderPyPdf()) process.load_file(test_file_path) - process.add_masking_llm("groq/llama-3.2-11b-text-preview") + # process.add_masking_llm("groq/llama-3.2-3b-preview") + llm = LLM("groq/llama-3.2-11b-text-preview") + process.add_masking_llm(llm) # Act - test_text = ( - "Mr. George Collins lives at 123 Main St, Anytown, USA 12345. His phone number is 555-1234. " - "Jane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210. " - "The company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. " - "For customer service, call 1-800-555-9876 or email support@example.com. " - "Sarah Lee, our HR manager, can be contacted at 444-333-2222 or sarah.lee@company.com. " - "The project budget is $250,000, with an additional $50,000 allocated for contingencies. " - "Monthly maintenance costs are estimated at $3,500. " - "For international clients, please use +1-555-987-6543. " - "Our tax ID number is 12-3456789." - ) + test_text = "Mr. George Collins lives at 123 Main St, Anytown, USA 12345.\n His phone number is 555-1234.\nJane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210.\nThe company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. \nFor customer service, call 1-800-555-9876 or email support@example.com. \nSarah Lee, our HR manager, can be contacted at 444-333-2222 or sarah.lee@company.com.\nThe project budget is $250,000, with an additional $50,000 allocated for contingencies. \nMonthly maintenance costs are estimated at $3,500. \nFor international clients, please use +1-555-987-6543. \nOur tax ID number is 12-3456789." # Act result = asyncio.run(process.mask_content(test_text)) @@ -123,19 +116,19 @@ def test_simple_use_case(): unmasked_content = process.unmask_content(result.masked_text, result.mapping) assert unmasked_content == test_text -def test_deterministic_hashing(): - # Arrange - process = Process() - process.add_masking_llm("groq/llama-3.2-11b-text-preview", MaskingStrategy.DETERMINISTIC_HASHING) +# def test_deterministic_hashing(): +# # Arrange +# process = Process() +# process.add_masking_llm("groq/llama-3.2-11b-text-preview", MaskingStrategy.DETERMINISTIC_HASHING) - test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." +# test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." - # Act - result = asyncio.run(process.mask_content(test_text)) +# # Act +# result = asyncio.run(process.mask_content(test_text)) - # Assert - assert result.masked_text is not None - assert result.mapping is not None +# # Assert +# assert result.masked_text is not None +# assert result.mapping is not None if __name__ == "__main__": test_mask() \ No newline at end of file From 780f9d3d6ece99c9c4e430f08c1257d8741afd52 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Tue, 19 Nov 2024 19:30:50 +0100 Subject: [PATCH 10/12] EM version without middleware interaction --- .../masking/llm_masking_strategy.py | 2 +- tests/test_process.py | 30 ++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/extract_thinker/masking/llm_masking_strategy.py b/extract_thinker/masking/llm_masking_strategy.py index 6fee20b..050a095 100644 --- a/extract_thinker/masking/llm_masking_strategy.py +++ b/extract_thinker/masking/llm_masking_strategy.py @@ -8,7 +8,7 @@ async def mask_content(self, content: str) -> MaskContract: messages_step1 = [ { "role": "system", - "content": "You are an AI assistant that masks sensitive information in text." + "content": "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values or non-PII data." }, { "role": "user", diff --git a/tests/test_process.py b/tests/test_process.py index d940bfb..b7ee699 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -33,8 +33,7 @@ def test_mask(): # Act test_text = "Mr. George Collins lives at 123 Main St, Anytown, USA 12345.\n His phone number is 555-1234.\nJane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210.\nThe company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. \nFor customer service, call 1-800-555-9876 or email support@example.com. \nSarah Lee, our HR manager, can be contacted at 444-333-2222 or sarah.lee@company.com.\nThe project budget is $250,000, with an additional $50,000 allocated for contingencies. \nMonthly maintenance costs are estimated at $3,500. \nFor international clients, please use +1-555-987-6543. \nOur tax ID number is 12-3456789." - - # Act + result = asyncio.run(process.mask_content(test_text)) # Assert @@ -84,8 +83,18 @@ def test_mask(): # Test unmasking unmasked_content = process.unmask_content(result.masked_text, result.mapping) - # Optionally, verify the entire unmasked content matches the original - assert unmasked_content == test_text, "Unmasked content does not match the original content" + # Normalize strings by standardizing whitespace and newlines + def normalize_string(s: str) -> str: + # Replace all whitespace sequences (including newlines) with a single space + # and strip leading/trailing whitespace + return ' '.join(s.split()) + + # Test unmasking with normalized strings + normalized_unmasked = normalize_string(unmasked_content) + normalized_original = normalize_string(test_text) + + # Compare normalized strings + assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content" def test_simple_use_case(): # Arrange @@ -133,6 +142,19 @@ def test_deterministic_hashing(): test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." + # Normalize strings by standardizing whitespace and newlines + def normalize_string(s: str) -> str: + # Replace all whitespace sequences (including newlines) with a single space + # and strip leading/trailing whitespace + return ' '.join(s.split()) + + # Test unmasking with normalized strings + normalized_unmasked = normalize_string(result.masked_text) + normalized_original = normalize_string(test_text) + + # Compare normalized strings + assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content" + # Act result = asyncio.run(process.mask_content(test_text)) From 11d056b12544e3a97d8aee884232a6e28793c384 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Wed, 20 Nov 2024 11:22:40 +0100 Subject: [PATCH 11/12] Making Type uppercase for ruff verification --- extract_thinker/extractor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/extract_thinker/extractor.py b/extract_thinker/extractor.py index 3a59ea0..a137a31 100644 --- a/extract_thinker/extractor.py +++ b/extract_thinker/extractor.py @@ -83,7 +83,7 @@ def load_llm(self, model: Optional[str] = None) -> None: def extract( self, source: Union[str, IO, list], - response_model: type[BaseModel], + response_model: Type[BaseModel], vision: bool = False, content: Optional[str] = None, ) -> Any: @@ -115,20 +115,20 @@ def extract( async def extract_async( self, source: Union[str, IO, list], - response_model: type[BaseModel], + response_model: Type[BaseModel], vision: bool = False, ) -> Any: return await asyncio.to_thread(self.extract, source, response_model, vision) def extract_from_content( - self, content: str, response_model: type[BaseModel], vision: bool = False + self, content: str, response_model: Type[BaseModel], vision: bool = False ) -> str: return self._extract(content, None, response_model, vision) def extract_from_list( self, data: List[Dict[Any, Any]], - response_model: type[BaseModel], + response_model: Type[BaseModel], vision: bool ) -> str: # check if document_loader is None, raise error @@ -146,7 +146,7 @@ def extract_from_list( return self._extract(content, data, response_model, vision, is_stream=False) def extract_from_file( - self, file: str, response_model: type[BaseModel], vision: bool = False + self, file: str, response_model: Type[BaseModel], vision: bool = False ) -> str: if self.document_loader is not None: content = self.document_loader.load_content_from_file(file) @@ -158,7 +158,7 @@ def extract_from_file( return self._extract(content, file, response_model, vision) def extract_from_stream( - self, stream: IO, response_model: type[BaseModel], vision: bool = False + self, stream: IO, response_model: Type[BaseModel], vision: bool = False ) -> str: # check if document_loader is None, raise error if self.document_loader is None: @@ -401,7 +401,7 @@ def split_content( chunks.append(current_chunk.strip()) return chunks - def aggregate_results(self, results: List[Any], response_model: type[BaseModel]) -> Any: + def aggregate_results(self, results: List[Any], response_model: Type[BaseModel]) -> Any: if len(results) == 1: return results[0] From ac5233a854c2bb2af621bd653fe45fc1d593fca4 Mon Sep 17 00:00:00 2001 From: julio Almeida Date: Wed, 20 Nov 2024 11:32:05 +0100 Subject: [PATCH 12/12] change | to Union (py 39) --- extract_thinker/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_thinker/process.py b/extract_thinker/process.py index 995bee0..57163bb 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -47,7 +47,7 @@ def __init__(self): self.masking_strategy: Optional[AbstractMaskingStrategy] = None self.llm: Optional[LLM] = None - def add_masking_llm(self, model: Optional[str | LLM] = None, strategy: Optional[MaskContract] = MaskingStrategy.SIMPLE_PLACEHOLDER): + def add_masking_llm(self, model: Optional[Union[str, LLM]] = None, strategy: Optional[MaskContract] = MaskingStrategy.SIMPLE_PLACEHOLDER): if isinstance(model, LLM): self.llm = model elif model is not None: