59 changes: 58 additions & 1 deletion app/main.py
@@ -2,11 +2,21 @@
from io import BytesIO
from pathlib import Path

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body
from fastapi.responses import JSONResponse
from dotenv import load_dotenv
from PIL import Image

from .services.sentence_segmenter import split_sentences
from .services.tokenizer import tokenize
from .services.openai_eval import evaluate_segmentation_with_openai

# Schemas
from .schemas.sentences import SentencesRequest, SentencesResponse
from .schemas.tokens import TokensRequest, TokensResponse
from .schemas.validate import ValidateRequest, ValidateResponse


from .schemas.ocr import OCRResponse
from .services.vision_ocr import (
encode_bytes_to_b64,
@@ -91,3 +101,50 @@ async def ocr_image(
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"서버 오류: {e}")


# --- Sentence segmentation only ---
@app.post("/sentences", response_model=SentencesResponse)
async def sentences_endpoint(payload: SentencesRequest = Body(...)):
"""
    Split the input text (or an array of paragraphs) into sentences only.
"""
if not (payload.text or payload.paragraphs):
raise HTTPException(status_code=400, detail="text 또는 paragraphs 중 하나는 필요합니다.")

    # Full original text
whole_text = payload.text or " ".join(payload.paragraphs or [])
    # Split into sentences
if payload.paragraphs:
sents = []
for para in payload.paragraphs:
sents.extend(split_sentences(para))
else:
sents = split_sentences(whole_text)

return SentencesResponse(text=whole_text, sentences=sents)

# --- Tokenization only ---
@app.post("/tokens", response_model=TokensResponse)
async def tokens_endpoint(payload: TokensRequest = Body(...)):
"""
    Take a list of sentences and split each one into words (tokens) only.
    (Sentence segmentation is not performed by this endpoint.)
"""
if not payload.sentences:
raise HTTPException(status_code=400, detail="sentences가 비어있습니다.")
tokens_per_sentence = [tokenize(s) for s in payload.sentences]
return TokensResponse(tokens_per_sentence=tokens_per_sentence)


@app.post("/sentences/validate", response_model=ValidateResponse)
async def validate_sentences(payload: ValidateRequest = Body(...)):
try:
result = evaluate_segmentation_with_openai(
text=payload.text,
sentences=payload.sentences,
model="gpt-4.1-mini",
)
return ValidateResponse(**result)
except Exception as e:
raise HTTPException(status_code=500, detail=f"OpenAI 검증 실패: {e}")
12 changes: 12 additions & 0 deletions app/schemas/sentences.py
@@ -0,0 +1,12 @@
from typing import List, Optional
from pydantic import BaseModel

class SentencesRequest(BaseModel):
    # Either field may be used on its own; if both are given, paragraphs takes priority
    text: Optional[str] = None
    paragraphs: Optional[List[str]] = None

class SentencesResponse(BaseModel):
    # The entire input joined into a single original text (for reference)
    text: str
    # List of sentences
    sentences: List[str]
8 changes: 8 additions & 0 deletions app/schemas/tokens.py
@@ -0,0 +1,8 @@
from typing import List
from pydantic import BaseModel

class TokensRequest(BaseModel):
    sentences: List[str]  # assumes sentence segmentation has already been done

class TokensResponse(BaseModel):
tokens_per_sentence: List[List[str]]
14 changes: 14 additions & 0 deletions app/schemas/validate.py
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import List, Optional, Any, Dict

class ValidateRequest(BaseModel):
text: str
sentences: List[str]

class ValidateResponse(BaseModel):
score: float
verdict: str
issues: List[str]
suggestions: List[str]
fixed_sentences: List[str]
raw: str
49 changes: 49 additions & 0 deletions app/services/openai_eval.py
@@ -0,0 +1,49 @@
import os, json
from typing import List, Dict, Any, Optional
from openai import OpenAI

def _client() -> OpenAI:
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("환경변수 OPENAI_API_KEY가 필요합니다.")
return OpenAI(api_key=api_key)

def evaluate_segmentation_with_openai(
text: str,
sentences: List[str],
model: str = "gpt-4.1-mini",
) -> Dict[str, Any]:
"""
원문 text와 분리된 sentences를 넘기면,
- 점수(0~1), 판단(OK/REVISE), 문제 목록, 추천 수정, 수정된 문장 배열을 JSON으로 반환
"""
client = _client()
sys = (
"You are a careful Korean text segmentation evaluator. "
"Assess whether the given list of sentences is a correct segmentation "
"of the original text. Respond in JSON with keys: "
"`score` (0~1 float), `verdict` ('OK'|'REVISE'), "
"`issues` (string[]), `suggestions` (string[]), `fixed_sentences` (string[]). "
"Keep `fixed_sentences` length close to the original sentences length; merge/split as needed."
)
user = {
"original_text": text,
"sentences": sentences,
}
resp = client.chat.completions.create(
model=model,
response_format={"type": "json_object"},
messages=[
{"role": "system", "content": sys},
{"role": "user", "content": json.dumps(user, ensure_ascii=False)},
],
temperature=0.1,
)
content = resp.choices[0].message.content
try:
data = json.loads(content)
except Exception:
data = {"score": 0.0, "verdict": "REVISE", "issues": ["JSON parse error"], "suggestions": [], "fixed_sentences": []}
data["model"] = model
data["raw"] = content
return data
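
A minimal usage sketch for this helper, assuming OPENAI_API_KEY is set and the package is importable as app.services (the sample input is illustrative; the printed values depend on the model):

from app.services.openai_eval import evaluate_segmentation_with_openai

result = evaluate_segmentation_with_openai(
    text="안녕하세요. 오늘 날씨가 좋네요!",
    sentences=["안녕하세요.", "오늘 날씨가 좋네요!"],
)
print(result["verdict"], result["score"])  # e.g. "OK" 0.95, model-dependent
print(result["fixed_sentences"])           # the model's suggested segmentation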
49 changes: 49 additions & 0 deletions app/services/sentence_segmenter.py
@@ -0,0 +1,49 @@
import re
from typing import List

# Sentence-ending punctuation and closing quotes
_SENT_END = r"[\.!\?…]+"
_RIGHT_QUOTE = r"[\"'”’]?"

# Capture without lookbehind: include up to the terminator (+ optional closing quote), or to the end of input
_SENT_CAPTURE_RE = re.compile(rf"[^\.!\?…]+?(?:{_SENT_END}{_RIGHT_QUOTE}|$)")

def _normalize_spaces_keep_newline(text: str) -> str:
    # Keep newlines; normalize only spaces and tabs
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r" \n+", "\n", text)
text = re.sub(r"\n+ ", "\n", text)
return text.strip()

def _tidy(sentence: str) -> str:
    # Collapse repeated whitespace and drop spaces before punctuation
s = re.sub(r"\s+", " ", sentence)
s = re.sub(r"\s+([,\.!\?…])", r"\1", s)
return s.strip()

def split_sentences(text: str, split_on_newline: bool = False) -> List[str]:
"""
텍스트 → 문장 리스트
- 기본: 종결부호(. ! ? …) 기준
- split_on_newline=True: 종결부호가 없을 땐 \n도 문장 경계로 보조 분리
"""
if not text:
return []
t = _normalize_spaces_keep_newline(text)

    # Capture sentence candidates
raw_parts = _SENT_CAPTURE_RE.findall(t)
parts: List[str] = []
for p in raw_parts:
p = p.strip()
if not p:
continue
if split_on_newline and ("\n" in p) and not re.search(_SENT_END, p):
            # A chunk without terminal punctuation: fall back to splitting on newlines
for seg in re.split(r"\n+", p):
seg = seg.strip()
if seg:
parts.append(_tidy(seg))
else:
parts.append(_tidy(p))
return parts
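
A quick illustration of the splitting behaviour; the expected outputs below are traced from the regex above rather than captured from a test run:

from app.services.sentence_segmenter import split_sentences

split_sentences("안녕하세요. 오늘 날씨가 좋네요! 내일 봐요")
# -> ['안녕하세요.', '오늘 날씨가 좋네요!', '내일 봐요']

split_sentences("첫 줄\n둘째 줄", split_on_newline=True)
# -> ['첫 줄', '둘째 줄']  (newlines act as a fallback boundary when no terminator is present)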
10 changes: 10 additions & 0 deletions app/services/tokenizer.py
@@ -0,0 +1,10 @@
import re
from typing import List

# Group runs of Hangul, Latin letters, and digits; any other symbol becomes its own token
_TOKEN_RE = re.compile(r"[가-힣]+|[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9가-힣]")

def tokenize(sentence: str) -> List[str]:
if not sentence:
return []
return [m.group(0) for m in _TOKEN_RE.finditer(sentence)]
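
For illustration, the pattern keeps Hangul runs, Latin words (allowing a single apostrophe contraction), and digit runs together, while every other non-space symbol becomes its own token; the expected output is traced from the regex rather than from a test run:

from app.services.tokenizer import tokenize

tokenize("Hello, 세계! It's 2024.")
# -> ['Hello', ',', '세계', '!', "It's", '2024', '.']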