diff --git a/app/main.py b/app/main.py
index 427e3cc..a59abcc 100644
--- a/app/main.py
+++ b/app/main.py
@@ -2,11 +2,21 @@
 from io import BytesIO
 from pathlib import Path
 
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body
 from fastapi.responses import JSONResponse
 from dotenv import load_dotenv
 from PIL import Image
 
+from .services.sentence_segmenter import split_sentences
+from .services.tokenizer import tokenize
+from .services.openai_eval import evaluate_segmentation_with_openai
+
+# Schemas
+from .schemas.sentences import SentencesRequest, SentencesResponse
+from .schemas.tokens import TokensRequest, TokensResponse
+from .schemas.validate import ValidateRequest, ValidateResponse
+
+
 from .schemas.ocr import OCRResponse
 from .services.vision_ocr import (
     encode_bytes_to_b64,
@@ -91,3 +101,50 @@ async def ocr_image(
         raise
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"서버 오류: {e}")
+
+
+# --- Sentence splitting only ---
+@app.post("/sentences", response_model=SentencesResponse)
+async def sentences_endpoint(payload: SentencesRequest = Body(...)):
+    """
+    Split the input text (or list of paragraphs) into sentences only.
+    """
+    if not (payload.text or payload.paragraphs):
+        raise HTTPException(status_code=400, detail="Either text or paragraphs is required.")
+
+    # Full original text
+    whole_text = payload.text or " ".join(payload.paragraphs or [])
+    # Sentence splitting
+    if payload.paragraphs:
+        sents = []
+        for para in payload.paragraphs:
+            sents.extend(split_sentences(para))
+    else:
+        sents = split_sentences(whole_text)
+
+    return SentencesResponse(text=whole_text, sentences=sents)
+
+# --- Tokenization only ---
+@app.post("/tokens", response_model=TokensResponse)
+async def tokens_endpoint(payload: TokensRequest = Body(...)):
+    """
+    Take a list of sentences and split each one into word tokens only
+    (sentence splitting is not done in this endpoint).
+    """
+    if not payload.sentences:
+        raise HTTPException(status_code=400, detail="sentences is empty.")
+    tokens_per_sentence = [tokenize(s) for s in payload.sentences]
+    return TokensResponse(tokens_per_sentence=tokens_per_sentence)
+
+
+@app.post("/sentences/validate", response_model=ValidateResponse)
+async def validate_sentences(payload: ValidateRequest = Body(...)):
+    try:
+        result = evaluate_segmentation_with_openai(
+            text=payload.text,
+            sentences=payload.sentences,
+            model="gpt-4.1-mini",
+        )
+        return ValidateResponse(**result)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"OpenAI validation failed: {e}")
\ No newline at end of file
diff --git a/app/schemas/sentences.py b/app/schemas/sentences.py
new file mode 100644
index 0000000..c60971b
--- /dev/null
+++ b/app/schemas/sentences.py
@@ -0,0 +1,13 @@
+from typing import List, Optional
+from pydantic import BaseModel
+
+class SentencesRequest(BaseModel):
+    # Either field may be used; if both are given, paragraphs takes precedence
+    text: Optional[str] = None
+    paragraphs: Optional[List[str]] = None
+
+class SentencesResponse(BaseModel):
+    # The full input joined into one original text (for summary use)
+    text: str
+    # List of sentences
+    sentences: List[str]
diff --git a/app/schemas/tokens.py b/app/schemas/tokens.py
new file mode 100644
index 0000000..eec8549
--- /dev/null
+++ b/app/schemas/tokens.py
@@ -0,0 +1,8 @@
+from typing import List
+from pydantic import BaseModel
+
+class TokensRequest(BaseModel):
+    sentences: List[str]  # assumes sentence splitting has already been done
+
+class TokensResponse(BaseModel):
+    tokens_per_sentence: List[List[str]]
diff --git a/app/schemas/validate.py b/app/schemas/validate.py
new file mode 100644
index 0000000..d8f37db
--- /dev/null
+++ b/app/schemas/validate.py
@@ -0,0 +1,14 @@
+from pydantic import BaseModel
+from typing import List, Optional, Any, Dict
+
+class ValidateRequest(BaseModel):
+    text: str
+    sentences: List[str]
+
+class ValidateResponse(BaseModel):
+    score: float
+    verdict: str
+    issues: List[str]
+    suggestions: List[str]
+    fixed_sentences: List[str]
+    raw: str
diff --git a/app/services/openai_eval.py b/app/services/openai_eval.py
new file mode 100644
index 0000000..20cce77
--- /dev/null
+++ b/app/services/openai_eval.py
@@ -0,0 +1,49 @@
+import os, json
+from typing import List, Dict, Any, Optional
+from openai import OpenAI
+
+def _client() -> OpenAI:
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        raise RuntimeError("The OPENAI_API_KEY environment variable is required.")
+    return OpenAI(api_key=api_key)
+
+def evaluate_segmentation_with_openai(
+    text: str,
+    sentences: List[str],
+    model: str = "gpt-4.1-mini",
+) -> Dict[str, Any]:
+    """
+    Given the original text and the segmented sentences, returns JSON with:
+    - a score (0~1), a verdict (OK/REVISE), a list of issues, suggested fixes, and a corrected sentence array.
+    """
+    client = _client()
+    sys = (
+        "You are a careful Korean text segmentation evaluator. "
+        "Assess whether the given list of sentences is a correct segmentation "
+        "of the original text. Respond in JSON with keys: "
+        "`score` (0~1 float), `verdict` ('OK'|'REVISE'), "
+        "`issues` (string[]), `suggestions` (string[]), `fixed_sentences` (string[]). "
+        "Keep `fixed_sentences` length close to the original sentences length; merge/split as needed."
+    )
+    user = {
+        "original_text": text,
+        "sentences": sentences,
+    }
+    resp = client.chat.completions.create(
+        model=model,
+        response_format={"type": "json_object"},
+        messages=[
+            {"role": "system", "content": sys},
+            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
+        ],
+        temperature=0.1,
+    )
+    content = resp.choices[0].message.content
+    try:
+        data = json.loads(content)
+    except Exception:
+        data = {"score": 0.0, "verdict": "REVISE", "issues": ["JSON parse error"], "suggestions": [], "fixed_sentences": []}
+    data["model"] = model
+    data["raw"] = content
+    return data
diff --git a/app/services/sentence_segmenter.py b/app/services/sentence_segmenter.py
new file mode 100644
index 0000000..c1da422
--- /dev/null
+++ b/app/services/sentence_segmenter.py
@@ -0,0 +1,49 @@
+import re
+from typing import List
+
+# Sentence-ending punctuation and closing quotes
+_SENT_END = r"[\.!\?…]+"
+_RIGHT_QUOTE = r"[\"'”’]?"
+
+# Capture without lookbehind: take text up to and including a terminator (+ closing quote), or to end of input
+_SENT_CAPTURE_RE = re.compile(rf"[^\.!\?…]+?(?:{_SENT_END}{_RIGHT_QUOTE}|$)")
+
+def _normalize_spaces_keep_newline(text: str) -> str:
+    # Keep newlines; normalize spaces only
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r" \n+", "\n", text)
+    text = re.sub(r"\n+ ", "\n", text)
+    return text.strip()
+
+def _tidy(sentence: str) -> str:
+    # Collapse duplicate whitespace and drop spaces before punctuation
+    s = re.sub(r"\s+", " ", sentence)
+    s = re.sub(r"\s+([,\.!\?…])", r"\1", s)
+    return s.strip()
+
+def split_sentences(text: str, split_on_newline: bool = False) -> List[str]:
+    """
+    Text -> list of sentences
+    - Default: split on sentence-ending punctuation (. ! ? …)
+    - split_on_newline=True: when a fragment has no terminator, also use \n as a fallback boundary
+    """
+    if not text:
+        return []
+    t = _normalize_spaces_keep_newline(text)
+
+    # Capture sentences
+    raw_parts = _SENT_CAPTURE_RE.findall(t)
+    parts: List[str] = []
+    for p in raw_parts:
+        p = p.strip()
+        if not p:
+            continue
+        if split_on_newline and ("\n" in p) and not re.search(_SENT_END, p):
+            # Fragments without a terminator are split on newlines as a fallback
+            for seg in re.split(r"\n+", p):
+                seg = seg.strip()
+                if seg:
+                    parts.append(_tidy(seg))
+        else:
+            parts.append(_tidy(p))
+    return parts
diff --git a/app/services/tokenizer.py b/app/services/tokenizer.py
new file mode 100644
index 0000000..ec98949
--- /dev/null
+++ b/app/services/tokenizer.py
@@ -0,0 +1,10 @@
+import re
+from typing import List
+
+# Group runs of Korean, Latin letters, or digits; any other symbol becomes its own token
+_TOKEN_RE = re.compile(r"[가-힣]+|[A-Za-z]+(?:['’][A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9가-힣]")
+
+def tokenize(sentence: str) -> List[str]:
+    if not sentence:
+        return []
+    return [m.group(0) for m in _TOKEN_RE.finditer(sentence)]
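Usage sketch (not part of the diff): one way the three new endpoints could be exercised once the app is running, e.g. via `uvicorn app.main:app --reload`. The base URL, the sample Korean text, and the `requests` dependency are illustrative assumptions, not part of this change; the /sentences/validate call additionally requires OPENAI_API_KEY to be set for the server process.

    import requests

    BASE = "http://localhost:8000"  # assumed local dev server address

    # 1) Split raw text into sentences via POST /sentences.
    sent_resp = requests.post(f"{BASE}/sentences", json={"text": "안녕하세요. 반갑습니다!"}).json()

    # 2) Tokenize the returned sentences via POST /tokens.
    tok_resp = requests.post(f"{BASE}/tokens", json={"sentences": sent_resp["sentences"]}).json()

    # 3) Ask the OpenAI-backed validator to grade the segmentation via POST /sentences/validate.
    report = requests.post(
        f"{BASE}/sentences/validate",
        json={"text": sent_resp["text"], "sentences": sent_resp["sentences"]},
    ).json()
    print(report["verdict"], report["score"])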