Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
3 changes: 3 additions & 0 deletions model/fineTuning/GPU_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import torch
# Prints True when PyTorch can use a CUDA GPU, False otherwise.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# Only ask for the device name when CUDA is usable: get_device_name() raises
# on machines without a CUDA device/driver, which would crash this check.
if cuda_available:
    print(torch.cuda.get_device_name(0))  # name of GPU 0
else:
    print("No CUDA-capable GPU detected.")
25 changes: 25 additions & 0 deletions model/fineTuning/baseModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 베이스 모델 테스트
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the pretrained extractive-QA model and its tokenizer.
model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Question and the context passage the answer is extracted from.
question = "둘 중에 무엇으로 할래요?"
context = "남자는 여자에게 내일 메뉴에 대해 물었고 여자는 치킨이랑 피자 중에서 고르라고 했다."

# Tokenize the (question, context) pair into PyTorch tensors.
inputs = tokenizer(question, context, return_tensors="pt")

# Inference only: disable autograd so no gradient graph is built or kept.
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start token, and one past the most likely end token
# (the +1 makes the end index usable as an exclusive slice bound).
answer_start_index = int(torch.argmax(outputs.start_logits))
answer_end_index = int(torch.argmax(outputs.end_logits)) + 1

# Convert the predicted answer tokens back into a string.
# If the predicted end precedes the start, the slice is empty and so is the answer.
answer_token_ids = inputs["input_ids"][0][answer_start_index:answer_end_index]
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(answer_token_ids)
)

print(f"Answer: {answer}")
58 changes: 58 additions & 0 deletions model/fineTuning/data/1.Training/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json

# Input file handed to clean_json_data() below. The path is relative, so it is
# resolved against the current working directory at run time.
merged_file_path = "fineTuning/data/1.Training/merged_train_data.json"

# JSON data cleaning function
def clean_json_data(file_path, output_path=None):
    """Clean a JSON list of QA entries and write the result to a new file.

    Entries that are empty, are not JSON objects, or lack a non-empty
    ``context`` or ``question`` are dropped. Entries whose ``answers`` field
    is missing, is not an object, or has no/empty ``text`` are kept, with
    ``answers`` set to the placeholder ``{"text": [""], "answer_start": [0]}``.

    Args:
        file_path: Path of the JSON file to read; its top level must be a list.
        output_path: Where to write the cleaned list. Defaults to
            ``fineTuning/data/1.Training/cleaned_train_data.json``.

    Returns:
        The path the cleaned data was written to, or ``None`` if the input
        could not be read/parsed, was not a list, or the output could not be
        written. Failures are reported with ``print`` rather than raised.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any read failure is reported, not raised.
        print(f"Unexpected error: {e}")
        return None

    # Iterating a dict or scalar here would yield keys or raise; reject it up front.
    if not isinstance(data, list):
        print(f"Unexpected error: expected a JSON list, got {type(data).__name__}")
        return None

    cleaned_data = []
    for i, entry in enumerate(data):
        if not entry:  # skip None (and other empty) entries
            print(f"Warning: Entry {i} is None. Skipping...")
            continue
        if not isinstance(entry, dict):
            # Strings/lists/numbers cannot carry the required fields.
            print(f"Warning: Entry {i} is not an object. Skipping...")
            continue

        # Required fields: drop entries without a non-empty context/question.
        if not entry.get("context"):
            print(f"Warning: Entry {i} missing 'context'. Skipping...")
            continue
        if not entry.get("question"):
            print(f"Warning: Entry {i} missing 'question'. Skipping...")
            continue

        answers = entry.get("answers")
        if not isinstance(answers, dict):
            print(f"Warning: Entry {i} missing or invalid 'answers'. Setting default values...")
            entry["answers"] = {"text": [""], "answer_start": [0]}
        elif not answers.get("text"):
            # .get() also covers an 'answers' dict that has no 'text' key at all,
            # which previously raised KeyError.
            print(f"Warning: Entry {i} has empty 'answers'. Setting default values...")
            answers["text"] = [""]
            answers["answer_start"] = [0]

        cleaned_data.append(entry)

    # Save the cleaned data to a new JSON file.
    if output_path is None:
        cleaned_file_path = "fineTuning/data/1.Training/cleaned_train_data.json"
    else:
        cleaned_file_path = output_path
    try:
        with open(cleaned_file_path, "w", encoding="utf-8") as cleaned_file:
            json.dump(cleaned_data, cleaned_file, ensure_ascii=False, indent=4)
        print(f"Cleaned data saved to: {cleaned_file_path}")
    except Exception as e:
        print(f"Error saving cleaned data: {e}")
        return None

    return cleaned_file_path

# Run the cleaning step on the merged file and report the outcome.
cleaned_file_path = clean_json_data(merged_file_path)
status_message = (
    f"Cleaned file path: {cleaned_file_path}"
    if cleaned_file_path
    else "Failed to clean and save JSON data."
)
print(status_message)
408,638 changes: 408,638 additions & 0 deletions model/fineTuning/data/1.Training/cleaned_train_data.json

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions model/fineTuning/data/1.Training/dataMerge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import json

# Folder holding the per-topic labeled JSON files (relative to the working directory).
folder_path = "fineTuning/data/1.Training/labeled_data"

merged_data = []

# Sort the names: os.listdir() returns entries in arbitrary order, and a stable
# order keeps the merged file reproducible across runs and machines.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    # extend() on a dict would silently append its keys; fail loudly instead.
    if not isinstance(data, list):
        raise ValueError(
            f"{file_path}: expected a JSON list, got {type(data).__name__}"
        )
    merged_data.extend(data)

# Write next to the labeled_data folder, i.e.
# fineTuning/data/1.Training/merged_train_data.json, which is the path the
# cleaning script reads. A bare file name would land in the working directory.
output_file = os.path.join(os.path.dirname(folder_path), "merged_train_data.json")
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(merged_data, file, ensure_ascii=False, indent=4)

24,821 changes: 24,821 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/civil complaint1_2497.json

Large diffs are not rendered by default.

1,010 changes: 1,010 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/civil complaint2_0695.json

Large diffs are not rendered by default.

26,320 changes: 26,320 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/civil complaint3_2614.json

Large diffs are not rendered by default.

17,703 changes: 17,703 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/civil complaint4_2095.json

Large diffs are not rendered by default.

32,403 changes: 32,403 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/education1_3320.json

Large diffs are not rendered by default.

19,178 changes: 19,178 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/education2_2217.json

Large diffs are not rendered by default.

11,546 changes: 11,546 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/education3_1181.json

Large diffs are not rendered by default.

10,792 changes: 10,792 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/education4_1120.json

Large diffs are not rendered by default.

28,773 changes: 28,773 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping1_3030.json

Large diffs are not rendered by default.

29,656 changes: 29,656 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping2_3077.json

Large diffs are not rendered by default.

23,382 changes: 23,382 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping3_2356.json

Large diffs are not rendered by default.

23,276 changes: 23,276 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping4_2349.json

Large diffs are not rendered by default.

22,757 changes: 22,757 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping5_2314.json

Large diffs are not rendered by default.

11,860 changes: 11,860 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping6_1194.json

Large diffs are not rendered by default.

25,590 changes: 25,590 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/shopping7_2540.json

Large diffs are not rendered by default.

16,826 changes: 16,826 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/tourism1_1621.json

Large diffs are not rendered by default.

10,694 changes: 10,694 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/tourism2_1110.json

Large diffs are not rendered by default.

17,258 changes: 17,258 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/tourism3_1657.json

Large diffs are not rendered by default.

27,651 changes: 27,651 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/tourism4_2724.json

Large diffs are not rendered by default.

27,278 changes: 27,278 additions & 0 deletions model/fineTuning/data/1.Training/labeled_data/tourism5_2692.json

Large diffs are not rendered by default.

Loading