Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 97 additions & 74 deletions app/algorithms/PrimerDesigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,11 @@ def generate_candidates(
if not (gc_range[0] <= gc_content <= gc_range[1]):
continue

# 3. 3' 말단 안정성 및 GC Clamp
dg3 = sum(NN_PARAMS.get(s[-5:][j : j + 2], (0, 0))[0] for j in range(4))
# 3. 3' 말단 안정성 및 GC Clamp (올바른 깁스 자유에너지 계산으로 수정)
dh3 = sum(NN_PARAMS.get(s[-5:][j : j + 2], (0, 0))[0] for j in range(4))
ds3 = sum(NN_PARAMS.get(s[-5:][j : j + 2], (0, 0))[1] for j in range(4))
dg3 = dh3 - (310.15 * (ds3 / 1000.0)) # 37°C(310.15K) 기준 dG 계산

if dg3 <= -10.0:
continue
if gc_clamp and s[-1] not in "GC":
Expand All @@ -126,26 +129,40 @@ def generate_candidates(
# 좌표 변환 및 매핑 유틸리티 추가
# ==========================================
def locate_template_in_genome(
    self, template_seq: str, chunk_size: int = 5_000_000
) -> Optional[Dict]:
    """Stage 1: find the 1-based genomic coordinate of a template sequence.

    Each reference in ``self.genome`` is scanned in overlapping windows
    rather than being fetched whole, which bounds how much sequence is held
    in memory at once. Inside every window the forward strand is tried
    first, then the reverse complement; the first hit in scan order wins.

    Args:
        template_seq: Template sequence to locate (exact substring match).
        chunk_size: Number of bases fetched per window. The window is
            widened automatically when the template is too long for it.

    Returns:
        A dict with ``chrom``, ``genomic_start`` (1-based), ``strand``
        (``"+"`` or ``"-"``) and ``template_length``, or ``None`` when the
        template is empty or no window contains it.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    template_length = len(template_seq)
    if template_length == 0:
        # str.find("") always returns 0, which would report a bogus hit.
        return None

    # Consecutive windows share `overlap` bases so that a hit straddling a
    # window boundary is fully contained in the following window.
    overlap = template_length
    # Guarantee a strictly positive step even for templates that are as
    # long as (or longer than) the requested chunk size; otherwise range()
    # would raise on a zero step or silently scan nothing on a negative one.
    window = max(chunk_size, 2 * template_length)
    step = window - overlap

    rev_seq = None  # reverse complement, computed once on first use

    for ref in self.genome.references:
        ref_len = self.genome.get_reference_length(ref)

        for start_idx in range(0, ref_len, step):
            end_idx = min(start_idx + window, ref_len)

            try:
                # Fetch only the current window, not the whole reference.
                chunk_seq = self.genome.fetch(ref, start_idx, end_idx)
            except Exception:
                # Best effort: keep scanning, but leave a trace so that a
                # skipped region is not mistaken for "template not found".
                logging.getLogger(__name__).warning(
                    "Failed to fetch %s:%d-%d; skipping window",
                    ref,
                    start_idx,
                    end_idx,
                    exc_info=True,
                )
                continue

            strand = "+"
            pos = chunk_seq.find(template_seq)
            if pos == -1:
                if rev_seq is None:
                    rev_seq = reverse_complement(template_seq)
                strand = "-"
                pos = chunk_seq.find(rev_seq)

            if pos != -1:
                return {
                    "chrom": ref,
                    # window-local 0-based offset -> absolute 1-based
                    "genomic_start": start_idx + pos + 1,
                    "strand": strand,
                    "template_length": template_length,
                }
    return None

def map_to_genomic_coords(self, primer: Dict, template_info: Dict) -> Dict:
Expand Down Expand Up @@ -240,13 +257,10 @@ def filter_specific_primers(
max_hits=50,
mismatch_cutoff=2,
) -> List[Dict]:
"""
Stage 2.3: 게놈 전체 특이성 일괄 검사 (I/O 병목 극적 최적화)
기존 specificity_check를 대체하며, 후보군 전체(List)를 받아 염색체 호출을 24회로 최소화합니다.
"""
"""Stage 2.3: 게놈 전체 특이성 일괄 검사 (메모리 최적화 - 청크 스캔)"""
valid_primers = []

# 1. Mispriming Library (반복 서열) 필터링 - SQLite 쿼리 사전 수행
# 1. Mispriming Library (반복 서열) 필터링
if mispriming_library:
for p in primers:
self.cur.execute(
Expand All @@ -258,70 +272,79 @@ def filter_specific_primers(
else:
valid_primers = primers.copy()

# 2. 상태 추적용 딕셔너리 구성
primer_pool = {p["seq"]: p for p in valid_primers}
hit_counts = {p["seq"]: 0 for p in valid_primers}

# 3. 루프 역전: 염색체를 한 번만 불러오고, 메모리 상에서 남은 모든 프라이머 스캔
# 5MB 청크 세팅 (오버랩은 프라이머 최대 길이)
chunk_size = 5_000_000
overlap = max(len(p["seq"]) for p in valid_primers) if valid_primers else 30

for ref in self.genome.references:
if not primer_pool:
break # 모든 프라이머가 탈락했다면 스캔 즉시 종료

full_seq = self.genome.fetch(ref)
break
ref_len = self.genome.get_reference_length(ref)

for p_seq in list(primer_pool.keys()):
for search_seq in [p_seq, reverse_complement(p_seq)]:
pos = full_seq.find(search_seq)
# 5MB씩 슬라이딩 스캔
for start_idx in range(0, ref_len, chunk_size - overlap):
if not primer_pool:
break

while pos != -1:
pos_1based = pos + 1
end_1based = pos + len(search_seq)

# 의도된 타겟 구간 무시
if ref == target_chrom and target_start <= pos_1based <= target_end:
pos = full_seq.find(search_seq, pos + 1)
continue

# DB 변이체 필터링 (일치 타겟 발견 시에만)
if splice_variant_handling:
self.cur.execute(
"SELECT transcript_id FROM exon WHERE chrom=? AND start <= ? AND end >= ?",
(ref, pos_1based, end_1based),
)
if self.cur.fetchone():
pos = full_seq.find(search_seq, pos + 1)
continue
try:
end_idx = min(start_idx + chunk_size, ref_len)
chunk_seq = self.genome.fetch(ref, start_idx, end_idx)
except Exception:
continue

if snp_exclusion:
self.cur.execute(
"SELECT COUNT(*) FROM snp WHERE chrom=? AND pos BETWEEN ? AND ?",
(ref, pos_1based, end_1based),
)
if self.cur.fetchone()[0] > 0:
pos = full_seq.find(search_seq, pos + 1)
for p_seq in list(primer_pool.keys()):
for search_seq in [p_seq, reverse_complement(p_seq)]:
pos = chunk_seq.find(search_seq)

while pos != -1:
# 로컬 chunk 안에서의 pos를 게놈 절대 좌표(1-based)로 변환
pos_1based = start_idx + pos + 1
end_1based = start_idx + pos + len(search_seq)

if ref == target_chrom and target_start <= pos_1based <= target_end:
pos = chunk_seq.find(search_seq, pos + 1)
continue

# 3' 말단 미스매치 정밀 검사
off_target = full_seq[pos : pos + len(p_seq)]
mm = needleman_wunsch_mismatch(p_seq[-10:], off_target[-10:])

if mm < mismatch_cutoff:
# 치명적 Off-target 발견 시 즉시 탈락
del primer_pool[p_seq]
break
if splice_variant_handling:
self.cur.execute(
"SELECT transcript_id FROM exon WHERE chrom=? AND start <= ? AND end >= ?",
(ref, pos_1based, end_1based),
)
if self.cur.fetchone():
pos = chunk_seq.find(search_seq, pos + 1)
continue

if snp_exclusion:
self.cur.execute(
"SELECT COUNT(*) FROM snp WHERE chrom=? AND pos BETWEEN ? AND ?",
(ref, pos_1based, end_1based),
)
if self.cur.fetchone()[0] > 0:
pos = chunk_seq.find(search_seq, pos + 1)
continue

# 3' 말단 미스매치 정밀 검사
off_target = chunk_seq[pos : pos + len(p_seq)]
mm = needleman_wunsch_mismatch(p_seq[-10:], off_target[-10:])

hit_counts[p_seq] += 1
if hit_counts[p_seq] > max_hits:
del primer_pool[p_seq]
break
if mm < mismatch_cutoff:
del primer_pool[p_seq]
break

hit_counts[p_seq] += 1
if hit_counts[p_seq] > max_hits:
del primer_pool[p_seq]
break

pos = full_seq.find(search_seq, pos + 1)
pos = chunk_seq.find(search_seq, pos + 1)

# 현재 프라이머가 탈락했다면, Reverse Complement 검사 등 내부 루프 완전히 중단
if p_seq not in primer_pool:
break

# 안전성이 검증된 프라이머들만 반환
return list(primer_pool.values())

def pair_primers(
Expand Down
9 changes: 9 additions & 0 deletions app/api/v1/endpoints/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,15 @@ async def design(request: PrimerDesignRequest) -> PrimerDesignResponse:
if template_info:
candidates = _filter_candidates_by_template(designer, request, candidates, template_info)

# [추가] 품질 점수(Penalty) 기준 상위 50개 필터링
opt_tm = request.basic.primerTm.opt
for cand in candidates:
# 페널티 = |Tm오차| + |말단안정성오차|
cand["penalty"] = abs(cand.get("tm", 0) - opt_tm) + abs(cand.get("dg3", 0) + 8.0)

candidates.sort(key=lambda x: x["penalty"])
candidates = candidates[:50] # 최정예 50개만 선정

return _build_response(request, template_info, candidates, started)
except HTTPException:
raise
Expand Down