59 changes: 58 additions & 1 deletion app/main.py
@@ -2,11 +2,21 @@
from io import BytesIO
from pathlib import Path

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body
from fastapi.responses import JSONResponse
from dotenv import load_dotenv
from PIL import Image

from .services.sentence_segmenter import split_sentences
from .services.tokenizer import tokenize
from .services.openai_eval import evaluate_segmentation_with_openai

# Schemas
from .schemas.sentences import SentencesRequest, SentencesResponse
from .schemas.tokens import TokensRequest, TokensResponse
from .schemas.validate import ValidateRequest, ValidateResponse


from .schemas.ocr import OCRResponse
from .services.vision_ocr import (
encode_bytes_to_b64,
@@ -91,3 +101,50 @@ async def ocr_image(
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"서버 오류: {e}")


# --- Sentence segmentation only ---
@app.post("/sentences", response_model=SentencesResponse)
async def sentences_endpoint(payload: SentencesRequest = Body(...)):
"""
    Split the input text (or an array of paragraphs) into sentences only.
"""
if not (payload.text or payload.paragraphs):
raise HTTPException(status_code=400, detail="text 또는 paragraphs 중 하나는 필요합니다.")

    # Full original text
whole_text = payload.text or " ".join(payload.paragraphs or [])
    # Split into sentences
if payload.paragraphs:
sents = []
for para in payload.paragraphs:
sents.extend(split_sentences(para))
else:
sents = split_sentences(whole_text)

return SentencesResponse(text=whole_text, sentences=sents)

# --- Tokenization only ---
@app.post("/tokens", response_model=TokensResponse)
async def tokens_endpoint(payload: TokensRequest = Body(...)):
"""
    Take a list of sentences and split each one into words (tokens) only.
    (Sentence segmentation is not performed by this endpoint.)
"""
if not payload.sentences:
raise HTTPException(status_code=400, detail="sentences가 비어있습니다.")
tokens_per_sentence = [tokenize(s) for s in payload.sentences]
return TokensResponse(tokens_per_sentence=tokens_per_sentence)


@app.post("/sentences/validate", response_model=ValidateResponse)
async def validate_sentences(payload: ValidateRequest = Body(...)):
try:
result = evaluate_segmentation_with_openai(
text=payload.text,
sentences=payload.sentences,
model="gpt-4.1-mini",
)
return ValidateResponse(**result)
except Exception as e:
raise HTTPException(status_code=500, detail=f"OpenAI 검증 실패: {e}")
12 changes: 12 additions & 0 deletions app/schemas/sentences.py
@@ -0,0 +1,12 @@
from typing import List, Optional
from pydantic import BaseModel

class SentencesRequest(BaseModel):
    # Either field may be used on its own; if both are given, paragraphs takes priority
    text: Optional[str] = None
    paragraphs: Optional[List[str]] = None

class SentencesResponse(BaseModel):
    # The entire input joined into a single original text (for reference)
    text: str
    # List of sentences
    sentences: List[str]
8 changes: 8 additions & 0 deletions app/schemas/tokens.py
@@ -0,0 +1,8 @@
from typing import List
from pydantic import BaseModel

class TokensRequest(BaseModel):
    sentences: List[str]  # assumes sentence segmentation has already been done

class TokensResponse(BaseModel):
tokens_per_sentence: List[List[str]]
14 changes: 14 additions & 0 deletions app/schemas/validate.py
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import List, Optional, Any, Dict

class ValidateRequest(BaseModel):
text: str
sentences: List[str]

class ValidateResponse(BaseModel):
score: float
verdict: str
issues: List[str]
suggestions: List[str]
fixed_sentences: List[str]
raw: str
49 changes: 49 additions & 0 deletions app/services/openai_eval.py
@@ -0,0 +1,49 @@
import os, json
from typing import List, Dict, Any, Optional
from openai import OpenAI

def _client() -> OpenAI:
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("환경변수 OPENAI_API_KEY가 필요합니다.")
return OpenAI(api_key=api_key)

def evaluate_segmentation_with_openai(
text: str,
sentences: List[str],
model: str = "gpt-4.1-mini",
) -> Dict[str, Any]:
"""
원문 text와 분리된 sentences를 넘기면,
- 점수(0~1), 판단(OK/REVISE), 문제 목록, 추천 수정, 수정된 문장 배열을 JSON으로 반환
"""
client = _client()
sys = (
"You are a careful Korean text segmentation evaluator. "
"Assess whether the given list of sentences is a correct segmentation "
"of the original text. Respond in JSON with keys: "
"`score` (0~1 float), `verdict` ('OK'|'REVISE'), "
"`issues` (string[]), `suggestions` (string[]), `fixed_sentences` (string[]). "
"Keep `fixed_sentences` length close to the original sentences length; merge/split as needed."
)
user = {
"original_text": text,
"sentences": sentences,
}
resp = client.chat.completions.create(
model=model,
response_format={"type": "json_object"},
messages=[
{"role": "system", "content": sys},
{"role": "user", "content": json.dumps(user, ensure_ascii=False)},
],
temperature=0.1,
)
content = resp.choices[0].message.content
try:
data = json.loads(content)
except Exception:
data = {"score": 0.0, "verdict": "REVISE", "issues": ["JSON parse error"], "suggestions": [], "fixed_sentences": []}
data["model"] = model
data["raw"] = content
return data
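
A minimal usage sketch for this helper, assuming OPENAI_API_KEY is set and the package is importable as app.services (the sample input is illustrative; the printed values depend on the model):

from app.services.openai_eval import evaluate_segmentation_with_openai

result = evaluate_segmentation_with_openai(
    text="안녕하세요. 오늘 날씨가 좋네요!",
    sentences=["안녕하세요.", "오늘 날씨가 좋네요!"],
)
print(result["verdict"], result["score"])  # e.g. "OK" 0.95, model-dependent
print(result["fixed_sentences"])           # the model's suggested segmentation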
49 changes: 49 additions & 0 deletions app/services/sentence_segmenter.py
@@ -0,0 +1,49 @@
import re
from typing import List

# Sentence-ending punctuation and closing quotes
_SENT_END = r"[\.!\?…]+"
_RIGHT_QUOTE = r"[\"'”’]?"

# Capture without lookbehind: include up to the terminator (+ optional closing quote), or to the end of input
_SENT_CAPTURE_RE = re.compile(rf"[^\.!\?…]+?(?:{_SENT_END}{_RIGHT_QUOTE}|$)")

def _normalize_spaces_keep_newline(text: str) -> str:
    # Keep newlines; normalize only spaces and tabs
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r" \n+", "\n", text)
text = re.sub(r"\n+ ", "\n", text)
return text.strip()

def _tidy(sentence: str) -> str:
    # Collapse repeated whitespace and drop spaces before punctuation
s = re.sub(r"\s+", " ", sentence)
s = re.sub(r"\s+([,\.!\?…])", r"\1", s)
return s.strip()

def split_sentences(text: str, split_on_newline: bool = False) -> List[str]:
"""
텍스트 → 문장 리스트
- 기본: 종결부호(. ! ? …) 기준
- split_on_newline=True: 종결부호가 없을 땐 \n도 문장 경계로 보조 분리
"""
if not text:
return []
t = _normalize_spaces_keep_newline(text)

    # Capture sentence candidates
raw_parts = _SENT_CAPTURE_RE.findall(t)
parts: List[str] = []
for p in raw_parts:
p = p.strip()
if not p:
continue
if split_on_newline and ("\n" in p) and not re.search(_SENT_END, p):
            # A chunk without terminal punctuation: fall back to splitting on newlines
for seg in re.split(r"\n+", p):
seg = seg.strip()
if seg:
parts.append(_tidy(seg))
else:
parts.append(_tidy(p))
return parts
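
A quick illustration of the splitting behaviour; the expected outputs below are traced from the regex above rather than captured from a test run:

from app.services.sentence_segmenter import split_sentences

split_sentences("안녕하세요. 오늘 날씨가 좋네요! 내일 봐요")
# -> ['안녕하세요.', '오늘 날씨가 좋네요!', '내일 봐요']

split_sentences("첫 줄\n둘째 줄", split_on_newline=True)
# -> ['첫 줄', '둘째 줄']  (newlines act as a fallback boundary when no terminator is present)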
10 changes: 10 additions & 0 deletions app/services/tokenizer.py
@@ -0,0 +1,10 @@
import re
from typing import List

# Group runs of Hangul, Latin letters, and digits; any other symbol becomes its own token
_TOKEN_RE = re.compile(r"[가-힣]+|[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9가-힣]")

def tokenize(sentence: str) -> List[str]:
if not sentence:
return []
return [m.group(0) for m in _TOKEN_RE.finditer(sentence)]
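
For illustration, the pattern keeps Hangul runs, Latin words (allowing a single apostrophe contraction), and digit runs together, while every other non-space symbol becomes its own token; the expected output is traced from the regex rather than from a test run:

from app.services.tokenizer import tokenize

tokenize("Hello, 세계! It's 2024.")
# -> ['Hello', ',', '세계', '!', "It's", '2024', '.']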