diff --git a/extract_thinker/batch_job.py b/extract_thinker/batch_job.py index 0a8753d..fa2e697 100644 --- a/extract_thinker/batch_job.py +++ b/extract_thinker/batch_job.py @@ -2,9 +2,9 @@ from typing import Any, List, Type, Iterator, Optional from pydantic import BaseModel from openai import OpenAI -from instructor.batch import BatchJob as InstructorBatchJob import json import os +from instructor.batch import BatchJob as InstructorBatchJob SLEEP_TIME = 60 diff --git a/extract_thinker/extractor.py b/extract_thinker/extractor.py index c88bcc1..def48ab 100644 --- a/extract_thinker/extractor.py +++ b/extract_thinker/extractor.py @@ -1,3 +1,4 @@ +import os import asyncio import base64 from typing import Any, Dict, List, Optional, IO, Type, Union, get_origin @@ -136,7 +137,7 @@ def set_skip_loading(self, skip: bool = True) -> None: def extract( self, source: Union[str, IO, list], - response_model: type[BaseModel], + response_model: Type[BaseModel], vision: bool = False, content: Optional[str] = None, completion_strategy: Optional[CompletionStrategy] = CompletionStrategy.FORBIDDEN @@ -241,7 +242,7 @@ def _map_to_universal_format( async def extract_async( self, source: Union[str, IO, list], - response_model: type[BaseModel], + response_model: Type[BaseModel], vision: bool = False, completion_strategy: Optional[CompletionStrategy] = CompletionStrategy.FORBIDDEN ) -> Any: @@ -694,7 +695,7 @@ def split_content( chunks.append(current_chunk.strip()) return chunks - def aggregate_results(self, results: List[Any], response_model: type[BaseModel]) -> Any: + def aggregate_results(self, results: List[Any], response_model: Type[BaseModel]) -> Any: if len(results) == 1: return results[0] diff --git a/extract_thinker/llm.py b/extract_thinker/llm.py index eed68bc..4f5029e 100644 --- a/extract_thinker/llm.py +++ b/extract_thinker/llm.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import instructor import litellm from litellm import 
Router @@ -18,7 +18,12 @@ def __init__(self, def load_router(self, router: Router) -> None: self.router = router - def request(self, messages: List[Dict[str, str]], response_model: str) -> Any: + def request( + self, + messages: List[Dict[str, str]], + response_model: Optional[str] = None + ) -> Any: + # Uncomment the following lines if you need to calculate max_tokens # contents = map(lambda message: message['content'], messages) # all_contents = ' '.join(contents) # max_tokens = num_tokens_from_string(all_contents) diff --git a/extract_thinker/masking/abstract_masking_strategy.py b/extract_thinker/masking/abstract_masking_strategy.py new file mode 100644 index 0000000..d96950f --- /dev/null +++ b/extract_thinker/masking/abstract_masking_strategy.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod +from extract_thinker.models.MaskContract import MaskContract +from extract_thinker.llm import LLM + +class AbstractMaskingStrategy(ABC): + def __init__(self, llm: LLM): + self.llm = llm + + @abstractmethod + async def mask_content(self, content: str) -> MaskContract: + pass + + @abstractmethod + def unmask_content(self, masked_content: str, mapping: dict) -> str: + pass \ No newline at end of file diff --git a/extract_thinker/masking/deterministic_hashing_masking_strategy.py b/extract_thinker/masking/deterministic_hashing_masking_strategy.py new file mode 100644 index 0000000..c0b4f9d --- /dev/null +++ b/extract_thinker/masking/deterministic_hashing_masking_strategy.py @@ -0,0 +1,220 @@ +import re +import hashlib +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract, MaskContractDict +from cryptography.hazmat.primitives import hashes +from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC +from cryptography.hazmat.backends import default_backend +import base64 + + +class 
DeterministicHashingMaskingStrategy(AbstractMaskingStrategy): + MASK_PII_PROMPT = ( + "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " + "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " + "Do not mask numerical values or non-PII data. Ensure placeholders do not contain underscores or spaces." + "Do not mask the key value result, they will be masked later." + "Don't return masked text, only the placeholder list." + "Values and Amounts(e.g $1000) are not PII values. The same for dates" + "Provide a step-by-step reasoning when identifying PII." + "Always return ##Placeholder list: as part of the response" + ) + + MASK_PII_USER_PROMPT = """Task: Mask personally identifiable information (PII) in the provided text, replacing PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values unless they are phone numbers or tax IDs. Return only the placeholder list with reasoning for each identified PII. + +Step 1: Reasoning & Thought Process +1. Analyze the text: + - Carefully examine each part of the text to determine if it contains PII. + - Focus on identifying common types of PII such as names, email addresses, phone numbers, tax IDs, and physical addresses. + - Ignore non-PII data such as dates, numerical values (except phone numbers and tax IDs), and any other non-sensitive information. + +2. Justify the decision: + - For each segment identified as PII, explain why it qualifies as such. + - Clearly differentiate between PII and non-PII elements. Provide reasoning for why certain elements are not PII. + +Step 2: Action +1. Mask PII: + - Replace each identified PII with an appropriate placeholder in the format [TYPE#] (e.g., [PERSON1], [ADDRESS1]). + - Do not mask any non-PII elements. + +2. Return placeholder list: + - Return a list of placeholders and their corresponding original values (but do not return the masked text). 
+ - Ensure placeholders are formatted without underscores or spaces. + +Examples: + +Example 1: +Original text: +John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. For international calls, use +1-555-987-6543. He deposited $5,000 on 2023-07-15. + +Output: +##Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 +[TAXID1]: 123-45-6789 +[PHONE2]: +1-555-987-6543 + +Example 2: +Original text: +Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com and her work number is 1-800-555-1234. The company's EIN is 12-3456789. The total amount was $1,200. + +Output: +##Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com +[PHONE1]: 1-800-555-1234 +[TAXID1]: 12-3456789 + +Example 3 (Demonstrating what NOT to mask): +Original text: +The company's revenue was $10,000,000 last year. Project XYZ has a budget of $500,000 and is due on 2023-12-31. The office can accommodate 50 employees. + +Note: In this example, no masking is performed because there is no PII present. Numerical values (except phone numbers and tax IDs), project names, and dates are not considered PII. + +Example 4: +Original text: +John Doe transferred $5000 to Jane Smith on 2021-05-01. + +Step 1: Reasoning & Thought Process +Upon analyzing the text "John Doe transferred $5000 to Jane Smith on 2021-05-01.", we need to identify any PII present. + +1. Identifying PII Types: The common types of PII we're looking for are names (e.g., John Doe, Jane Smith), email addresses, phone numbers, tax IDs, and physical addresses. +2. Examining Text Segments: + - "John Doe" - This is a name, which is a type of PII. + - "Jane Smith" - This is another name, which is a type of PII. + - "$5000" - This is a financial transaction amount, not a phone number or tax ID, so it's not a type of PII in this context. 
Numerical values like this are often found in everyday text and aren't PII. + - "2021-05-01" - This is a date, which is not PII because it doesn't contain identifying information about a person. + +Step 2: Action +Based on the identified PII types and segments, we'll create placeholders for each PII found. + +1. Masking PII: We'll replace each identified PII with an appropriate placeholder in the format [TYPE#]. +2. Returning Placeholder List: We'll return a list of placeholders and their corresponding original values. + +Output: +##Placeholder list: +[PERSON1]: John Doe +[PERSON2]: Jane Smith + +Text to mask: +{content} + +Provide your step-by-step reasoning, and then return the placeholder list. +""" + + CONVERT_TO_JSON_PROMPT = ( + "You are an AI assistant that converts placeholder lists into JSON format. " + "Ensure that placeholders are strictly in the format [TYPE#], without underscores or spaces." + ) + + CONVERT_TO_JSON_USER_PROMPT = """Convert the following placeholder lists into a JSON format. For each example, the JSON should have a single key: "mapping" (a dictionary of placeholders and their original PII values). Ensure placeholders are in the correct format [TYPE#], without underscores or spaces. 
+ +Example 1: +Placeholder list: +[PERSON1]: John Smith +[ADDRESS1]: 123 Main St, New York, NY 10001 +[PHONE1]: (555) 123-4567 + +Output: +{{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }} +}} + +Example 2: +Placeholder list: +[PERSON1]: Sarah Johnson +[EMAIL1]: sarah.j@email.com + +Output: +{{ + "mapping": {{ + "[PERSON1]": "Sarah Johnson", + "[EMAIL1]": "sarah.j@email.com" + }} +}} + +Now, please convert the following placeholder list into JSON format: + +{response_step1_content} + +##JSON +""" + + def __init__(self, llm: LLM): + super().__init__(llm) + self.placeholder_counter = {} + + async def mask_content(self, content: str) -> MaskContract: + response_step1_content = await self._step1_mask_pii(content) + response_step2_content = await self._step2_convert_to_json(response_step1_content) + result = self._parse_mask_contract_dict(response_step2_content.mapping, content) + return result + + def _parse_mask_contract_dict(self, mapping: dict, content: str) -> MaskContract: + masked_text = content + for placeholder, value in mapping.items(): + hash_value = self._deterministic_hash(value) + masked_text = masked_text.replace(value, f"{hash_value}") + return MaskContract(masked_text=masked_text, mapping=mapping) + + async def _step1_mask_pii(self, content: str) -> str: + messages_step1 = [ + {"role": "system", "content": self.MASK_PII_PROMPT}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, + ] + response_step1 = self.llm.request(messages_step1) + response_step1_content = response_step1.choices[0].message.content + + # Split the response into reasoning and the placeholder list + split_result = response_step1_content.split("##Placeholder list:") + if len(split_result) == 2: + reasoning_part = split_result[0].strip() + placeholder_list = split_result[1].strip() + else: + raise ValueError("Unexpected response format: 'Placeholder List' section not 
found.") + + # Return only the placeholder list + return placeholder_list + + async def _step2_convert_to_json(self, response_step1_content: str) -> MaskContractDict: + messages_step2 = [ + {"role": "system", "content": self.CONVERT_TO_JSON_PROMPT}, + { + "role": "user", + "content": self.CONVERT_TO_JSON_USER_PROMPT.format( + response_step1_content=response_step1_content, + ), + }, + ] + response_step2 = self.llm.request(messages_step2, MaskContractDict) + return response_step2 + + def _validate_placeholders(self, mask_contract: MaskContract): + placeholder_pattern = re.compile(r'^\[[A-Za-z]+[0-9]*\]$') + for placeholder in mask_contract.mapping.keys(): + if not placeholder_pattern.match(placeholder): + raise ValueError(f"Invalid placeholder format: {placeholder}") + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content + + def _deterministic_hash(self, value: str) -> str: + # Generate a deterministic hash using PBKDF2HMAC with SHA256 + salt = b'some_constant_salt' + kdf = PBKDF2HMAC( + algorithm=hashes.SHA256(), + length=32, + salt=salt, + iterations=100000, + backend=default_backend() + ) + hashed = kdf.derive(value.encode()) + return base64.urlsafe_b64encode(hashed).decode('utf-8') diff --git a/extract_thinker/masking/llm_masking_strategy.py b/extract_thinker/masking/llm_masking_strategy.py new file mode 100644 index 0000000..050a095 --- /dev/null +++ b/extract_thinker/masking/llm_masking_strategy.py @@ -0,0 +1,105 @@ +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract +from extract_thinker.llm import LLM + +class LLMMaskingStrategy(AbstractMaskingStrategy): + async def mask_content(self, content: str) -> MaskContract: + # Step 1: Get masked text and placeholder list + messages_step1 = [ + { + "role": "system", + 
"content": "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. Do not mask numerical values or non-PII data." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text. Replace sensitive information with placeholders like [PERSON1], [PERSON2], [ADDRESS1], [ADDRESS2], [PHONE1], [PHONE2], etc. Return the masked text and a list of placeholders with their original values. + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Placeholder list: + [PERSON1]: Sarah Johnson + [PRODUCT1]: laptop + [STORE1]: TechStore + [DATE1]: 2023-05-15 + [EMAIL1]: sarah.j@email.com + + Masked text: + [PERSON1] ordered a [PRODUCT1] from [STORE1] on [DATE1]. Her email is [EMAIL1]. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the placeholder list with the value and respective placeholder, and then the Masked text with the placeholders. + """ + } + ] + + response_step1 = self.llm.request(messages_step1) + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and placeholder lists into a JSON format. 
For each example, the JSON should have two main keys: "mapping" (a dictionary of placeholders and their original values) and "masked_text" (the text with placeholders). + Always use [], not "" or '' + Make sure that masked_text contains no sensitive information, only the placeholders. + + Example 1: + Placeholder list: + [PERSON1]: John Smith + [ADDRESS1]: 123 Main St, New York, NY 10001 + [PHONE1]: (555) 123-4567 + + Masked text: + [PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]. + + Output: + {{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1]." + }} + + Now, please convert the following masked text and placeholder list into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + return self.llm.request(messages_step2, MaskContract) + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for placeholder, original in mapping.items(): + masked_content = masked_content.replace(placeholder, original) + return masked_content \ No newline at end of file diff --git a/extract_thinker/masking/mocked_data_masking_strategy.py b/extract_thinker/masking/mocked_data_masking_strategy.py new file mode 100644 index 0000000..d62309f --- /dev/null +++ b/extract_thinker/masking/mocked_data_masking_strategy.py @@ -0,0 +1,143 @@ +import re +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract + +class MockedDataMaskingStrategy(AbstractMaskingStrategy): + def __init__(self, llm: LLM): + super().__init__(llm) + + async def mask_content(self, content: str) -> MaskContract: + # Step 1: Get masked text and mocked data mapping + messages_step1 = [ + { + "role": "system", + "content": "You are an AI assistant that masks sensitive information in text with 
mocked data." + }, + { + "role": "user", + "content": f""" + Please mask all sensitive information in the following text with mocked data. Replace sensitive information with realistic but fake data. Return the masked text and a mapping of original values to mocked data. + - Keep all values, doesnt constitute sensitive information + + Here are some examples: + + Example 1: + Original text: + John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567. + + Mocked data mapping: + "John Smith": "Michael Johnson" + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601" + "(555) 123-4567": "(312) 555-7890" + + Masked text: + Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890. + + Example 2: + Original text: + Sarah Johnson ordered a laptop from TechStore on 2023-05-15. Her email is sarah.j@email.com. + + Mocked data mapping: + "Sarah Johnson": "Emma Thompson" + "laptop": "tablet" + "TechStore": "GadgetWorld" + "2023-05-15": "2023-06-22" + "sarah.j@email.com": "emma.t@fakemail.com" + + Masked text: + Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com. + + Now, please mask the following text: + + Text to mask: + {content} + + Give me the mocked data mapping with the original value and respective mocked data, and then the Masked text with the mocked data. + """ + } + ] + + response_step1 = self.llm.request(messages_step1) + + response_step1_content = response_step1.choices[0].message.content + + # Step 2: Convert to JSON format + messages_step2 = [ + { + "role": "system", + "content": "You are an AI assistant that converts masked text information into JSON format." + }, + { + "role": "user", + "content": f""" + Convert the following masked texts and mocked data mappings into a JSON format. 
For each example, the JSON should have two main keys: "mapping" (a dictionary of original values and their mocked data) and "masked_text" (the text with mocked data). + + Example 1: + Mocked data mapping: + "John Smith": "Michael Johnson" + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601" + "(555) 123-4567": "(312) 555-7890" + + Masked text: + Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890. + + Output: + {{ + "mapping": {{ + "John Smith": "Michael Johnson", + "123 Main St, New York, NY 10001": "456 Oak Ave, Chicago, IL 60601", + "(555) 123-4567": "(312) 555-7890" + }}, + "masked_text": "Michael Johnson lives at 456 Oak Ave, Chicago, IL 60601. His phone number is (312) 555-7890." + }} + + Example 2: + Mocked data mapping: + "Sarah Johnson": "Emma Thompson" + "laptop": "tablet" + "TechStore": "GadgetWorld" + "2023-05-15": "2023-06-22" + "sarah.j@email.com": "emma.t@fakemail.com" + + Masked text: + Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com. + + Output: + {{ + "mapping": {{ + "Sarah Johnson": "Emma Thompson", + "laptop": "tablet", + "TechStore": "GadgetWorld", + "2023-05-15": "2023-06-22", + "sarah.j@email.com": "emma.t@fakemail.com" + }}, + "masked_text": "Emma Thompson ordered a tablet from GadgetWorld on 2023-06-22. Her email is emma.t@fakemail.com." 
+ }} + + Now, please convert the following masked text and mocked data mapping into JSON format: + + {response_step1_content} + + ##JSON + """ + } + ] + + response_step2 = self.llm.request(messages_step2, MaskContract) + + masked_text = response_step2.masked_text + mapping = response_step2.mapping + + for original, mocked in mapping.items(): + if original in masked_text: + masked_text = masked_text.replace(original, mocked) + + response_step2.masked_text = masked_text + + return response_step2 + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + for mocked, original in mapping.items(): + masked_content = masked_content.replace(mocked, original) + return masked_content \ No newline at end of file diff --git a/extract_thinker/masking/simple_placeholder_masking_strategy.py b/extract_thinker/masking/simple_placeholder_masking_strategy.py new file mode 100644 index 0000000..6387465 --- /dev/null +++ b/extract_thinker/masking/simple_placeholder_masking_strategy.py @@ -0,0 +1,55 @@ +import re +from extract_thinker.llm import LLM +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy +from extract_thinker.models.MaskContract import MaskContract +import asyncio + +class SimplePlaceholderMaskingStrategy(AbstractMaskingStrategy): + MASK_PII_PROMPT = ( + "You are an AI assistant that masks only Personally Identifiable Information (PII) in text. " + "Replace PII with placeholders in the format [TYPE#], e.g., [PERSON1], [ADDRESS1], [EMAIL1], etc. " + "Do not mask numerical values or non-PII data. Values and Amounts(e.g $1000) are not PII values. " + "The same applies for dates. Return the masked text and mapping in JSON format." + "Rethink what you did to make sure that you masking every PII value." + ) + + MASK_PII_USER_PROMPT = '''Mask personally identifiable information (PII) in the provided text, replacing PII with placeholders like [PERSON1], [ADDRESS1], [EMAIL1], etc. 
Return both the masked text and mapping in JSON format. + +Example: +Input: +John Smith lives at 123 Main St, New York, NY 10001. His phone number is (555) 123-4567 and his SSN is 123-45-6789. He deposited $5,000 on 2023-07-15. + +Output: +{{ + "mapping": {{ + "[PERSON1]": "John Smith", + "[ADDRESS1]": "123 Main St, New York, NY 10001", + "[PHONE1]": "(555) 123-4567", + "[TAXID1]": "123-45-6789" + }}, + "masked_text": "[PERSON1] lives at [ADDRESS1]. His phone number is [PHONE1] and his SSN is [TAXID1]. He deposited $5,000 on 2023-07-15." +}} + +Text to mask: +{content} + +Return the response in JSON format. +##JSON''' + + def __init__(self, llm: LLM): + super().__init__(llm) + + async def mask_content(self, content: str) -> MaskContract: + messages = [ + {"role": "system", "content": self.MASK_PII_PROMPT}, + {"role": "user", "content": self.MASK_PII_USER_PROMPT.format(content=content)}, + ] + + return self.llm.request(messages, MaskContract) + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + unmasked_text = masked_content + sorted_mapping = dict(sorted(mapping.items(), key=lambda x: -len(x[0]))) + for placeholder, original in sorted_mapping.items(): + unmasked_text = unmasked_text.replace(placeholder, original) + return unmasked_text \ No newline at end of file diff --git a/extract_thinker/models/MaskContract.py b/extract_thinker/models/MaskContract.py new file mode 100644 index 0000000..0e3cac2 --- /dev/null +++ b/extract_thinker/models/MaskContract.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field +from typing import Dict, List + +class MaskContract(BaseModel): + mapping: Dict[str, str] = Field(description="A dictionary mapping placeholders to original values") + masked_text: str = Field(description="The masked version of the text") + + def __init__(self, masked_text: str, mapping: Dict[str, str]): + super().__init__(masked_text=masked_text, mapping=mapping) + +class MaskContractDict(BaseModel): + mapping: Dict[str, str] = 
Field(description="A dictionary mapping placeholders to original values") \ No newline at end of file diff --git a/extract_thinker/process.py b/extract_thinker/process.py index e9e816c..9b2dac3 100644 --- a/extract_thinker/process.py +++ b/extract_thinker/process.py @@ -7,6 +7,7 @@ from extract_thinker.models.doc_groups2 import DocGroups2 from extract_thinker.models.splitting_strategy import SplittingStrategy from extract_thinker.extractor import Extractor +from extract_thinker.masking.deterministic_hashing_masking_strategy import DeterministicHashingMaskingStrategy from extract_thinker.models.classification import Classification from extract_thinker.document_loader.document_loader import DocumentLoader from extract_thinker.models.classification_tree import ClassificationTree @@ -17,6 +18,23 @@ DocGroups, ) from extract_thinker.utils import get_image_type +from extract_thinker.llm import LLM +from extract_thinker.models.MaskContract import MaskContract +from enum import Enum +from extract_thinker.masking.llm_masking_strategy import LLMMaskingStrategy +from extract_thinker.masking.simple_placeholder_masking_strategy import SimplePlaceholderMaskingStrategy +from extract_thinker.masking.mocked_data_masking_strategy import MockedDataMaskingStrategy +from extract_thinker.masking.abstract_masking_strategy import AbstractMaskingStrategy + +class ClassificationStrategy(Enum): + CONSENSUS = "consensus" + HIGHER_ORDER = "higher_order" + CONSENSUS_WITH_THRESHOLD = "both" + +class MaskingStrategy(Enum): + SIMPLE_PLACEHOLDER = "simple_placeholder" + MOCKED_DATA = "mocked_data" + DETERMINISTIC_HASHING = "deterministic_hashing" class Process: def __init__(self): @@ -30,6 +48,33 @@ def __init__(self): self.file_stream: Optional[IO] = None self.splitter: Optional[Splitter] = None self._content_loaded: bool = False # New internal flag + self.masking_strategy: Optional[AbstractMaskingStrategy] = None + self.llm: Optional[LLM] = None + + def add_masking_llm(self, model: 
Optional[Union[str, LLM]] = None, strategy: Optional[MaskingStrategy] = MaskingStrategy.SIMPLE_PLACEHOLDER): + if isinstance(model, LLM): + self.llm = model + elif model is not None: + self.llm = LLM(model) + + if strategy == MaskingStrategy.SIMPLE_PLACEHOLDER: + self.masking_strategy = SimplePlaceholderMaskingStrategy(self.llm) + elif strategy == MaskingStrategy.MOCKED_DATA: + self.masking_strategy = MockedDataMaskingStrategy(self.llm) + elif strategy == MaskingStrategy.DETERMINISTIC_HASHING: + self.masking_strategy = DeterministicHashingMaskingStrategy(self.llm) + + async def mask_content(self, content: str) -> MaskContract: + if self.masking_strategy is None: + raise ValueError("No masking strategy has been set. Please set a masking strategy with add_masking_llm.") + + return await self.masking_strategy.mask_content(content) + + def unmask_content(self, masked_content: str, mapping: dict) -> str: + if self.masking_strategy is None: + raise ValueError("No masking strategy has been set. 
Please set a masking strategy with add_masking_llm.") + + return self.masking_strategy.unmask_content(masked_content, mapping) def set_document_loader_for_file_type(self, file_type: str, document_loader: DocumentLoader): if self.document_loader is not None: @@ -300,4 +345,4 @@ async def process_doc_groups(groups: List[Any]) -> List[Any]: process_doc_groups(self.doc_groups) ) - return processedGroups + return processedGroups \ No newline at end of file diff --git a/tests/test_process.py b/tests/test_process.py index 44517cb..eb51b07 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -1,19 +1,167 @@ +import asyncio import os +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from dotenv import load_dotenv from extract_thinker import Contract, Extractor, Process, Classification +from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract +from extract_thinker.llm import LLM from extract_thinker.models.splitting_strategy import SplittingStrategy +from extract_thinker.process import MaskingStrategy from tests.models.invoice import InvoiceContract from tests.models.driver_license import DriverLicense from extract_thinker.image_splitter import ImageSplitter from extract_thinker.text_splitter import TextSplitter import pytest -# Setup environment and paths load_dotenv() +cwd = os.getcwd() CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) MULTI_PAGE_DOC_PATH = os.path.join(CURRENT_DIR, "files", "bulk.pdf") +def test_mask(): + # Arrange + test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + + process = Process() + process.load_document_loader(DocumentLoaderPyPdf()) + process.load_file(test_file_path) + # process.add_masking_llm("groq/llama-3.2-3b-preview") + llm = LLM("ollama/deepseek-r1:1.5b") + process.add_masking_llm(llm) + + # Act + test_text = "Mr. 
George Collins lives at 123 Main St, Anytown, USA 12345.\n His phone number is 555-1234.\nJane Smith resides at 456 Elm Avenue, Othercity, State 67890, and can be reached at (987) 654-3210.\nThe company's CEO, Robert Johnson, has an office at 789 Corporate Blvd, Suite 500, Bigcity, State 13579. \nFor customer service, call 1-800-555-9876 or email support@example.com. \nSarah Lee, our HR manager, can be contacted at 444-333-2222 or sarah.lee@company.com.\nThe project budget is $250,000, with an additional $50,000 allocated for contingencies. \nMonthly maintenance costs are estimated at $3,500. \nFor international clients, please use +1-555-987-6543. \nOur tax ID number is 12-3456789." + + result = asyncio.run(process.mask_content(test_text)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Check if all original PII is masked + pii_info = { + "persons": ["George Collins", "Jane Smith", "Robert Johnson", "Sarah Lee"], + "addresses": [ + "123 Main St, Anytown, USA 12345", + "456 Elm Avenue, Othercity, State 67890", + "789 Corporate Blvd, Suite 500, Bigcity, State 13579", + ], + "phones": ["555-1234", "(987) 654-3210", "1-800-555-9876", "444-333-2222", "+1-555-987-6543"], + "emails": ["support@example.com", "sarah.lee@company.com"], + "tax_id": ["12-3456789"], + } + + non_pii_info = [ + "Monthly maintenance costs are estimated at $3,500.", + ] + + # Ensure PII is masked + for person in pii_info["persons"]: + assert person not in result.masked_text, f"PII {person} was not masked properly" + + for address in pii_info["addresses"]: + assert address not in result.masked_text, f"PII address {address} was not masked properly" + + for phone in pii_info["phones"]: + assert phone not in result.masked_text, f"PII phone {phone} was not masked properly" + + for email in pii_info["emails"]: + assert email not in result.masked_text, f"PII email {email} was not masked properly" + + for tax in pii_info["tax_id"]: + assert tax not in 
result.masked_text, f"PII tax ID {tax} was not masked properly" + + # Ensure non-PII data remains unchanged + for info in non_pii_info: + assert info in result.masked_text, f"Non-PII {info} was unexpectedly masked" + + # check if mapping length is 15 + assert len(result.mapping) == 15, "Mapping should contain 15 items" + + # Test unmasking + unmasked_content = process.unmask_content(result.masked_text, result.mapping) + + # Normalize strings by standardizing whitespace and newlines + def normalize_string(s: str) -> str: + # Replace all whitespace sequences (including newlines) with a single space + # and strip leading/trailing whitespace + return ' '.join(s.split()) + + # Test unmasking with normalized strings + normalized_unmasked = normalize_string(unmasked_content) + normalized_original = normalize_string(test_text) + + # Compare normalized strings + assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content" + +def test_simple_use_case(): + # Arrange + test_file_path = os.path.join(cwd, "tests", "files", "invoice.pdf") + + process = Process() + process.load_document_loader(DocumentLoaderPyPdf()) + process.load_file(test_file_path) + process.add_masking_llm("groq/llama-3.2-11b-text-preview") + + # Arrange + test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." 
+ + # Act + result = asyncio.run(process.mask_content(test_text)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Ensure PII is masked + assert "John Doe" not in result.masked_text + assert "Jane Smith" not in result.masked_text + + # Ensure non-PII data remains + assert "$5000" in result.masked_text + assert "2021-05-01" in result.masked_text + assert "transferred" in result.masked_text + + # Check mapping + assert len(result.mapping) == 2 + assert "[PERSON1]" in result.mapping + assert "[PERSON2]" in result.mapping + assert result.mapping["[PERSON1]"] == "John Doe" + assert result.mapping["[PERSON2]"] == "Jane Smith" + + # Test unmasking + unmasked_content = process.unmask_content(result.masked_text, result.mapping) + assert unmasked_content == test_text + +def test_deterministic_hashing(): + # Arrange + process = Process() + process.add_masking_llm("groq/llama-3.2-11b-text-preview", MaskingStrategy.DETERMINISTIC_HASHING) + + test_text = "John Doe transferred $5000 to Jane Smith on 2021-05-01." 
+ + # Act: run the masking first so that result is bound before it is read + result = asyncio.run(process.mask_content(test_text)) + + # Assert + assert result.masked_text is not None + assert result.mapping is not None + + # Normalize strings by standardizing whitespace and newlines + def normalize_string(s: str) -> str: + # Replace all whitespace sequences (including newlines) with a single space + # and strip leading/trailing whitespace + return ' '.join(s.split()) + + # Test unmasking with normalized strings (round-trip through unmask_content, as in test_mask) + normalized_unmasked = normalize_string(process.unmask_content(result.masked_text, result.mapping)) + normalized_original = normalize_string(test_text) + + # Compare normalized strings + assert normalized_unmasked == normalized_original, "Unmasked content does not match the original content" + class VehicleRegistration(Contract): name_primary: str name_secondary: str @@ -162,4 +310,7 @@ def test_split_requires_splitter(): # Act & Assert with pytest.raises(ValueError, match="No splitter loaded"): - process.split([]) # Empty classifications list is fine for this test \ No newline at end of file + process.split([]) # Empty classifications list is fine for this test + +if __name__ == "__main__": + test_mask() \ No newline at end of file