Statement Normalization Pipeline using OpenAI #24

Open · wants to merge 24 commits into main from 150-inconsistent-statements
Commits (24)
d52ae27
create clean_statements.py script
dankim444 Jul 18, 2024
32750da
Merge branch 'main' into 150-inconsistent-statements
dankim444 Jul 19, 2024
9cb6e6a
Merge branch 'main' of github.com:Watts-Lab/commonsense-statements in…
dankim444 Jul 26, 2024
879df18
add script for normalizing statements
dankim444 Jul 31, 2024
2299a5e
add workflows
dankim444 Jul 31, 2024
e387dd4
normalize statements
dankim444 Jul 31, 2024
aa84242
standardize statements
dankim444 Aug 9, 2024
85e0dde
remove redundant statement files
dankim444 Aug 9, 2024
e8bfbfa
remove duplicates caused by normalization
dankim444 Aug 9, 2024
8488b26
update workflows
dankim444 Aug 9, 2024
fc390f3
implement openai normalization script
dankim444 Aug 15, 2024
d4ca500
normalize news statements and observable statements
dankim444 Aug 15, 2024
993d7f3
integrate normalization into workflows
dankim444 Aug 15, 2024
a2e38cf
fix format checking error
dankim444 Aug 16, 2024
fcf2066
update requirements
dankim444 Aug 16, 2024
5adce1d
clean email statements and change model to 3.5-turbo
dankim444 Aug 17, 2024
dc111dd
update calculate_translation_cost.py
dankim444 Aug 17, 2024
18b5d6b
Merge branch 'main' of github.com:Watts-Lab/commonsense-statements in…
dankim444 Aug 17, 2024
5805084
update calculate translation script
dankim444 Aug 17, 2024
de3996f
minor changes
dankim444 Aug 17, 2024
9476833
add english language code as suffix to original_statements and cleane…
dankim444 Aug 17, 2024
15fdcae
update readme
dankim444 Aug 17, 2024
6847958
switch model from 3.5-turbo to 4o
dankim444 Aug 19, 2024
de6dfcc
Merge branch 'main' into 150-inconsistent-statements
amirrr Oct 3, 2024
47 changes: 45 additions & 2 deletions .github/workflows/translate_and_remove_duplicates.yml
@@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
files:
description: 'Comma-separated list of files to translate in the raw_statements folder'
description: 'Comma-separated list of files to translate in the raw_statements folder (e.g. bob_statements_en.csv, jane_statements_en.csv)'
required: true
elicitation:
description: 'Elicitation method'
@@ -69,7 +69,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pandas
pip install -r requirements.txt

- name: Extract duplicate groups (if any)
run: |
@@ -93,3 +93,46 @@ jobs:
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
commit_message: remove duplicate statements

normalize-statements:
runs-on: ubuntu-latest
needs: check_and_remove_duplicates

env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install openai pandas

- name: Run normalize_statements_openai.py script
run: |
python ./.scripts/normalize_statements_openai.py

- name: Remove any duplicates caused by normalization
run: |
python ./.scripts/remove_duplicates_after_normalization.py

- name: Final format check
run: |
python ./.scripts/format_checker.py

- name: Final dimension check
run: |
python ./.scripts/dimension_checker.py

- name: Commit and push changes
uses: devops-infra/action-commit-push@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
commit_message: normalize statements
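
The `normalize-statements` job chains after `check_and_remove_duplicates` via `needs:` and runs four scripts in sequence. For local testing, a minimal dry-run sketch of the same sequence (an assumption of this review, not part of the PR; it presumes the repository root as the working directory and `OPENAI_API_KEY` already exported):

```python
import os
import subprocess

# mirror the CI job: run each script in order, stopping on the first failure
assert os.environ.get("OPENAI_API_KEY"), "export OPENAI_API_KEY first"

for script in [
    ".scripts/normalize_statements_openai.py",
    ".scripts/remove_duplicates_after_normalization.py",
    ".scripts/format_checker.py",
    ".scripts/dimension_checker.py",
]:
    subprocess.run(["python", script], check=True)  # check=True raises CalledProcessError on non-zero exit
```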
29 changes: 15 additions & 14 deletions .scripts/calculate_translation_cost.py
@@ -4,13 +4,14 @@
import boto3
from botocore.config import Config
import sys
import re

# define the supported languages
languages = ["ar", "bn", "es", "fr", "hi", "ja", "pt", "ru", "zh"]

# set up credentials
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# set up configuration details
my_config = Config(
@@ -75,15 +76,17 @@ def total_cost(directory):
count = 0
for filename in os.listdir(directory):
count += 1
if filename.endswith(".csv") and (
filename.split(".")[0].split("_")[-1] in languages
):
files_translated.add(
"_".join(filename.split(".")[0].split("_")[:-1]) + ".csv"
)
files_translated.add(filename)


lng_match = re.search(r'_([a-z]{2})(?:_cleaned)?\.csv$', filename) # extract language code from filename
lng = lng_match.group(1) if lng_match else None
if filename.endswith('.csv') and lng in languages:
match = re.search(r'(.*)_[a-z]{2}(?:_cleaned)?\.csv', filename) # extract the part before the language code
if match:
base = match.group(1)
files_translated.add(f'{base}_{lng}.csv') # files either have the _cleaned suffix or not
files_translated.add(f'{base}_{lng}_cleaned.csv')
files_translated.add(f'{base}_en.csv')
files_translated.add(f'{base}_en_cleaned.csv')

print("Translation Cost Calculation")
print("----------------------------------------")

@@ -97,9 +100,7 @@ def total_cost(directory):
df = pd.read_csv(filepath)

# Amazon Translate pricing: https://aws.amazon.com/translate/pricing/
total_characters_for_file = df["statement"].apply(len).sum() * len(
languages
)
total_characters_for_file = df["statement"].apply(len).sum() * len(languages)

try:
cost_per_character = get_price_per_character()
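
The cost script and the translators now share two filename patterns: one pulls the two-letter language code, the other the base name before it. A quick illustration of what they extract (the sample filenames are hypothetical):

```python
import re

samples = [
    "email_statements_en.csv",
    "email_statements_ar_cleaned.csv",
    "notes.csv",  # no language suffix, so neither pattern matches
]

for name in samples:
    lng = re.search(r'_([a-z]{2})(?:_cleaned)?\.csv$', name)    # language code, with or without _cleaned
    base = re.search(r'(.*)_[a-z]{2}(?:_cleaned)?\.csv', name)  # base name before the language code
    print(name, '->', base.group(1) if base else None, lng.group(1) if lng else None)

# email_statements_en.csv -> email_statements en
# email_statements_ar_cleaned.csv -> email_statements ar
# notes.csv -> None None
```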
112 changes: 112 additions & 0 deletions .scripts/normalize_statements_openai.py
@@ -0,0 +1,112 @@
import openai
import os
import re
import pandas as pd

# set up OpenAI credentials
openai.api_key = os.environ.get('OPENAI_API_KEY')

# define supported languages
languages = {
'ar': 'Arabic',
'bn': 'Bengali',
'en': 'English',
'es': 'Spanish',
'fr': 'French',
'hi': 'Hindi',
'ja': 'Japanese',
'pt': 'Portuguese',
'ru': 'Russian',
'zh': 'Chinese'
}

# call the OpenAI chat completion API to clean a single statement
def chat_completion_function(lng, user_prompt):
system_prompt = f"""
You are a language processing assistant. Given this statement in {languages[lng]}, perform the following steps:
1. Capitalize the first letter (if applicable for the language).
2. Remove leading and trailing punctuation.
3. Ensure the sentence ends with the appropriate full-stop punctuation native to the language (e.g., । for Bengali and Hindi, 。 for Japanese and Chinese).
4. Do not change the capitalization of any other words within the sentence besides the first word, including proper nouns, event names, and common words—preserve all original capitalization as given by the user.
5. Do not introduce any changes to the vocabulary, meaning, or phrasing. Preserve the exact words provided.
6. Do not introduce any additional punctuation, symbols, or special characters.

Return only the cleaned statement in the same format given by the user, without any additional text, explanations, or changes to the proper nouns' capitalization or vocabulary. Here are some examples in 10 different languages:
- '"Social Security and Medicare are programs that politicians can protect'" should be '"Social Security and Medicare are programs that politicians can protect."'
- "el Seguro Social y Medicare son programas que los políticos pueden proteger" should be "El seguro social y medicare son programas que los políticos pueden proteger."
- "সামাজিক নিরাপত্তা এবং মেডিকেয়ার হল এমন প্রোগ্রাম যা রাজনীতিবিদরা রক্ষা করতে পারেন" should be "সামাজিক নিরাপত্তা এবং মেডিকেয়ার হল এমন প্রোগ্রাম যা রাজনীতিবিদরা রক্ষা করতে পারেন।"
- "社会保障和医疗保险是政治家可以保护的计划" should be "社会保障和医疗保险是政客可以保护的计划。"
- "إن الضمان الاجتماعي والرعاية الطبية من البرامج التي يمكن للسياسيين حمايتها" should be "إن الضمان الاجتماعي والرعاية الطبية من البرامج التي يمكن للسياسيين حمايتها."
- "la sécurité sociale et Medicare sont des programmes que les politiciens peuvent protéger" should be "La sécurité sociale et Medicare sont des programmes que les politiciens peuvent protéger."
- "सामाजिक सुरक्षा और मेडिकेयर ऐसे कार्यक्रम हैं जिनकी रक्षा राजनेता कर सकते हैं" should be "सामाजिक सुरक्षा और मेडिकेयर ऐसे कार्यक्रम हैं जिनकी रक्षा राजनेता कर सकते हैं।"
- "社会保障とメディケアは政治家が守ることができる制度である" should be "社会保障とメディケアは政治家が守ることができるプログラムです。"
- "Социальное обеспечение и Medicare — это программы, которые политики могут защитить" should be "Социальное обеспечение и Medicare — это программы, которые политики могут защитить."
- "a Segurança Social e o Medicare são programas que os políticos podem proteger" should be "A Segurança Social e o Medicare são programas que os políticos podem proteger."
"""
try:
completion = openai.chat.completions.create(
model='gpt-4o',
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
]
)

response_content = completion.choices[0].message.content
return response_content.strip()
except Exception as e:
print(f"Error: {e}")
return None

# process files in raw_statements directory
def process_files(directory):
files_cleaned = set()
count = 0

# check if all files have already been cleaned
for filename in os.listdir(directory):
if filename.endswith('.csv'):
count += 1
if (filename.split('.')[0].split('_')[-1] == 'cleaned'):
files_cleaned.add(filename)

if count == len(files_cleaned):
print("All files have already been cleaned.")
return

for filename in os.listdir(directory):
if filename.endswith('.csv') and filename not in files_cleaned:
# extract the language code from the file
lng_group = re.search(r'_([a-z]{2})(?:_cleaned)?\.csv$', filename)

if lng_group:
lng = lng_group.group(1)
print(f"Detected language: {languages[lng]}")
else:
print(f"Error: Unable to determine language for {filename}")
continue

filepath = os.path.join(directory, filename)
df = pd.read_csv(filepath)
cleaned_statements = []

print(f"Processing {filename}...")

# clean each statement in the file
for statement in df['statement']:
cleaned_statement = chat_completion_function(lng, statement)
print(cleaned_statement)
cleaned_statements.append(cleaned_statement)

df['statement'] = cleaned_statements

# save cleaned data back to the file
df.to_csv(filepath, index=False)
cleaned_filename = f"{os.path.splitext(filename)[0]}_cleaned.csv"
cleaned_filepath = os.path.join(directory, cleaned_filename)
os.rename(filepath, cleaned_filepath)
print(f"Cleaned data saved back to {filepath} and renamed file to {cleaned_filepath}")


if __name__ == '__main__':
process_files('raw_statements')
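
Most of what the system prompt asks for is mechanical: capitalize the first letter, strip stray punctuation, end with the language's native full stop. A rough offline approximation of rules 1-3, useful for spot-checking API output (this sketch, including its punctuation table, is an assumption of this review and covers only the languages the prompt names):

```python
# terminal punctuation for the supported languages that do not use '.'
full_stops = {'bn': '।', 'hi': '।', 'ja': '。', 'zh': '。'}

def approximate_clean(statement: str, lng: str) -> str:
    s = statement.strip().strip('"\'.,;:!?।。')  # rule 2: strip leading/trailing punctuation
    if s and lng not in ('ja', 'zh'):            # rule 1: capitalize the first letter where applicable
        s = s[0].upper() + s[1:]
    return s + full_stops.get(lng, '.')          # rule 3: append the native full stop

print(approximate_clean('"social security is a program politicians can protect', 'en'))
# Social security is a program politicians can protect.
```

Unlike the model, this cannot distinguish proper nouns from sentence-initial words or cover scripts missing from its table, which is why the PR routes the work through gpt-4o.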
2 changes: 1 addition & 1 deletion .scripts/remove_duplicates.py
@@ -5,7 +5,7 @@
import argparse

"""
Function to remove duplicate statements across files and their corresponding translation files
Function to remove duplicate statements introduced by translation, across files and their corresponding translation files

Parameters:
- original_csv: path to the original CSV file containing English statements
46 changes: 46 additions & 0 deletions .scripts/remove_duplicates_after_normalization.py
@@ -0,0 +1,46 @@
import pandas as pd
import os

languages = ['ar','bn','en','es','fr','hi','ja','pt','ru','zh']

"""
Function that removes duplicates introduced by the normalize_statements_openai.py script

input: the directory to process the files where duplicates occur
"""
def remove_duplicates_after_normalization(directory):
base_names = set()
for filename in os.listdir(directory):
if filename.endswith('_cleaned.csv'):
base = '_'.join(filename.split('.')[0].split('_')[:-2])
base_names.add(base)

# process files with the same base
for base in base_names:
files = [f'{base}_{lng}_cleaned.csv' for lng in languages]

# identify indices to drop
indices_to_drop = set()
for filename in files:
filepath = os.path.join(directory, filename)
print(f'Checking for duplicates in {filename}...')
df = pd.read_csv(filepath)
duplicates = df.duplicated(keep='first')
dropped_indices = df.index[duplicates]
indices_to_drop.update(dropped_indices.tolist())

# drop the indices across all files with the same base name
if indices_to_drop:
for filename in files:
filepath = os.path.join(directory, filename)
df = pd.read_csv(filepath)
df.drop(list(indices_to_drop), inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv(filepath, index=False)
print(f'Dropped rows {sorted(indices_to_drop)} in {filename}')
else:
print(f'No duplicates found in {base}_en_cleaned.csv and its corresponding translation files.')


if __name__ == '__main__':
remove_duplicates_after_normalization('raw_statements')
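
Duplicates are detected per file, but the resulting row indices are dropped from every language file sharing the base name, so that row N remains the same statement in every translation. A toy illustration with hypothetical stand-ins for two parallel per-language files:

```python
import pandas as pd

en = pd.DataFrame({'statement': ['Cats are mammals.', 'Cats are mammals.', 'Water is wet.']})
fr = pd.DataFrame({'statement': ['Les chats sont des mammifères.', 'Les chats sont des mammifères.', "L'eau est mouillée."]})

# row 1 duplicates row 0 in the English file
indices_to_drop = en.index[en.duplicated(keep='first')]

# dropping the same indices everywhere keeps the files row-aligned
for df in (en, fr):
    df.drop(indices_to_drop, inplace=True)
    df.reset_index(drop=True, inplace=True)

print(len(en), len(fr))  # 2 2
```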
5 changes: 2 additions & 3 deletions .scripts/show_groups_of_duplicates.py
@@ -22,7 +22,6 @@ def show_groups_of_duplicates(directory):
filepath = os.path.join(directory, filename)
df = pd.read_csv(filepath)
df['line_number'] = df.index + 2 # +2 to account for the header row and 0-based index
df['lng'] = os.path.basename(filename).split('_')[-1].split('.')[0]

duplicates = df[df.duplicated('statement', keep=False)] # filter duplicate statements found in each file

@@ -39,13 +38,13 @@ def show_groups_of_duplicates(directory):
group['source_file'] = filename
group['group_number'] = group_number
group_number += 1
all_duplicates.append(group[['source_file', 'group_number', 'lng', 'line_number', 'statement']])
all_duplicates.append(group[['source_file', 'group_number', 'line_number', 'statement']])

if all_duplicates:
result_df = pd.concat(all_duplicates)

# group by group number and merge line_number and statement columns into a dictionary
result_df = result_df.groupby(['source_file', 'group_number', 'lng']).apply(
result_df = result_df.groupby(['source_file', 'group_number']).apply(
lambda x: x.set_index('line_number')['statement'].to_dict(), include_groups=False
).reset_index().rename(columns={0: 'line_and_statement'})

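
With the lng column gone (the language is recoverable from the source filename), grouping is by source file and group number alone. A toy run of the reshaping step on hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({
    'source_file': ['a_en.csv'] * 4,
    'group_number': [1, 1, 2, 2],
    'line_number': [2, 5, 7, 9],
    'statement': ['x', 'x', 'y', 'y'],
})

# collapse each duplicate group into a {line_number: statement} dictionary
result = df.groupby(['source_file', 'group_number']).apply(
    lambda x: x.set_index('line_number')['statement'].to_dict(), include_groups=False
).reset_index().rename(columns={0: 'line_and_statement'})

print(result)
#   source_file  group_number line_and_statement
# 0    a_en.csv             1   {2: 'x', 5: 'x'}
# 1    a_en.csv             2   {7: 'y', 9: 'y'}
```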
25 changes: 13 additions & 12 deletions .scripts/translate_statements_aws.py
@@ -3,11 +3,12 @@
import boto3
import argparse
import swifter
import re

# set up credentials
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
region_name = "us-east-1"
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
region_name = 'us-east-1'

# initialize amazon translate client
translate_client = boto3.client(
@@ -60,20 +61,20 @@ def translate_files(files, elicitation, committer):
for lng in languages:
print(f"Translating to {lng}...")

translated_statements = df["statement"].swifter.progress_bar(enable=True).apply(
translate_text, language=lng
)

total_characters += (
df["statement"].apply(len).sum()
) # accumluate the total number of characters translated
translated_statements = df["statement"].swifter.progress_bar(enable=True).apply(lambda x: translate_text(x, lng))
total_characters += df['statement'].apply(len).sum()  # accumulate the total number of characters translated
translated_df = df.copy()
translated_df["statement"] = translated_statements
translated_df["elicitation"] = elicitation
translated_df["committer"] = committer

filename = os.path.splitext(file)[0]
translated_file = f"{filename}_{lng}.csv"
basename = os.path.basename(file)  # e.g. raw_statements/email_statements_en.csv --> email_statements_en.csv
match = re.search(r'(.*)_[a-z]{2}(?:_cleaned)?\.csv', basename) # extract the part before the language code
if match:
filename = match.group(1)
else:
raise ValueError(f'Could not extract a base name from {basename}')  # fail fast instead of writing None_<lng>.csv
translated_file = f'raw_statements/{filename}_{lng}.csv' # e.g. raw_statements/email_statements_ar.csv
translated_df.to_csv(translated_file, index=False)
print(f"Translated {file} to {lng} and saved as {translated_file}")

16 changes: 10 additions & 6 deletions .scripts/translate_statements_azure.py
@@ -5,7 +5,9 @@
import argparse
import re       # needed by the basename parsing below
import swifter  # needed by the swifter progress-bar apply below

# set up authentication key and endpoint
azure_key = os.environ['AZURE_TRANSLATE_SERVICE_KEY']
azure_key = os.environ.get('AZURE_TRANSLATE_SERVICE_KEY')
endpoint = 'https://api.cognitive.microsofttranslator.com/'
location = 'eastus'

@@ -60,18 +60,22 @@ def translate_files(files, elicitation, committer):
for file in files:
df = pd.read_csv(file)
for lng in languages:
translated_statements = df['statement'].apply(lambda x: translate_text(x, lng))
translated_statements = df["statement"].swifter.progress_bar(enable=True).apply(lambda x: translate_text(x, lng))
total_characters += df['statement'].apply(len).sum()  # accumulate the total number of characters translated
translated_df = df.copy()
translated_df['statement'] = translated_statements
translated_df['elicitation'] = elicitation
translated_df['committer'] = committer

filename = os.path.splitext(file)[0]
translated_file = f'{filename}_{lng}.csv'
basename = os.path.basename(file)  # e.g. raw_statements/email_statements_en.csv --> email_statements_en.csv
match = re.search(r'(.*)_[a-z]{2}(?:_cleaned)?\.csv', basename) # extract the part before the language code
if match:
filename = match.group(1)
else:
raise ValueError(f'Could not extract a base name from {basename}')  # fail fast instead of writing None_<lng>.csv
translated_file = f'raw_statements/{filename}_{lng}.csv' # e.g. raw_statements/email_statements_ar.csv
translated_df.to_csv(translated_file, index=False)

print(f'Translated {file} to {lng} and saved as {translated_file}')
print(f"Translated {file} to {lng} and saved as {translated_file}")

return total_characters

Expand Down