Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'
implementation 'org.springframework.ai:spring-ai-openai-spring-boot-starter'
implementation 'org.seleniumhq.selenium:selenium-java:4.20.0'
implementation 'io.github.bonigarcia:webdrivermanager:5.8.0'
implementation 'com.microsoft.playwright:playwright:1.37.0'
implementation 'org.seleniumhq.selenium:selenium-devtools-v135:4.31.0'
implementation 'com.opencsv:opencsv:5.9'

Expand Down
32 changes: 27 additions & 5 deletions infra/local/dockerfile_backend
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,43 @@ COPY . .
RUN gradle bootJar --no-daemon

# 2단계: 실행 스테이지
FROM eclipse-temurin:21-jdk
FROM ubuntu:20.04

# vi 설치 (옵션)
RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y wget gnupg2

RUN apt-get update && apt-get install -y locales \
&& locale-gen ko_KR.UTF-8 \
&& update-locale LANG=ko_KR.UTF-8

ENV LANG=ko_KR.UTF-8
ENV LANGUAGE=ko_KR:ko
ENV LC_ALL=ko_KR.UTF-8

RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
&& echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \
&& apt-get update \
&& apt-get install -y temurin-21-jdk

RUN apt-get install -y \
vim dos2unix curl \
libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \
libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \
libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

# 빌드 스테이지에서 생성된 jar 파일 복사
COPY --from=builder /build/build/libs/*.jar app.jar
COPY script/backendSetting.sh /app/wait-for-elasticsearch.sh
COPY /infra/local/script/backendSetting.sh /app/wait-for-elasticsearch.sh

RUN dos2unix /app/wait-for-elasticsearch.sh
RUN chmod +x /app/wait-for-elasticsearch.sh

RUN chmod 777 /app/app.jar
ARG PROFILE
ENV PROFILE=${PROFILE}

EXPOSE 8080

ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"]
ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"]
2 changes: 1 addition & 1 deletion infra/local/dockerfile_nginx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ COPY nginx_config/default.conf /etc/nginx/conf.d/default.conf
VOLUME ["/etc/letsencrypt", "/var/www/certbot"]

# 필요한 쉘 스크립트 복사
COPY script/nginxSetting.sh start-nginx.sh
COPY /infra/local/script/nginxSetting.sh start-nginx.sh
COPY nginx_config/config.conf /config.conf

# 쉘 스크립트에 실행 권한 부여
Expand Down
25 changes: 22 additions & 3 deletions infra/main_app/dockerfile_backend
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,29 @@ COPY . .
RUN gradle bootJar --no-daemon

# 2단계: 실행 스테이지
FROM eclipse-temurin:21-jdk
FROM ubuntu:20.04

# vi 설치 (옵션)
RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y wget gnupg2

RUN apt-get update && apt-get install -y locales \
&& locale-gen ko_KR.UTF-8 \
&& update-locale LANG=ko_KR.UTF-8

ENV LANG=ko_KR.UTF-8
ENV LANGUAGE=ko_KR:ko
ENV LC_ALL=ko_KR.UTF-8

RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
&& echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \
&& apt-get update \
&& apt-get install -y temurin-21-jdk

RUN apt-get install -y \
vim dos2unix curl \
libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \
libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \
libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

Expand Down
35 changes: 25 additions & 10 deletions infra/main_app/nginx_config/config.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,31 @@ server {
proxy_read_timeout 900s;
}

location /embed {
proxy_pass http://embedding:5000/embed;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Reverse-proxy request URIs that begin with /embed to the "embedding" upstream on port 5000.
location /embed {
# The matched /embed prefix is replaced by the /embed path given here.
proxy_pass http://embedding:5000/embed;
# HTTP/1.1 together with the Upgrade / Connection headers below forwards protocol-upgrade requests upstream.
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
# Pass the original host, client address and scheme on to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;

# Wait up to 900 seconds between two successive reads from the upstream before timing out.
proxy_read_timeout 900s;
}

# Reverse-proxy request URIs under /embed/ to the "embedding" upstream on port 5000.
location /embed/ {
# The matched /embed/ prefix is replaced by /embed/, so /embed/<rest> is requested as /embed/<rest> upstream.
proxy_pass http://embedding:5000/embed/;
# HTTP/1.1 together with the Upgrade / Connection headers below forwards protocol-upgrade requests upstream.
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
# Pass the original host, client address and scheme on to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;

# Wait up to 900 seconds between two successive reads from the upstream before timing out.
proxy_read_timeout 900s;
}

location / {
proxy_pass http://frontend:80/;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.sangchu.preprocess.etl.controller;

import com.sangchu.preprocess.scheduler.PreProcessScheduler;
import lombok.RequiredArgsConstructor;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
Expand All @@ -20,6 +21,7 @@ public class MysqlBatchController {

private final JobLauncher jobLauncher;
private final Job mysqlJob;
private final PreProcessScheduler preProcessScheduler;

@PostMapping("/import/mysql")
public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, JobExecutionAlreadyRunningException, JobParametersInvalidException, JobRestartException {
Expand All @@ -31,4 +33,9 @@ public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, Jo

return "csv -> mysql 작업";
}

/**
 * HTTP POST endpoint mapped to {@code /scheduler} that invokes the pre-process scheduler on demand.
 *
 * <p>Calls {@code preProcessScheduler.scheduler()} directly and returns no response body.
 * NOTE(review): any class-level request-mapping prefix is not visible here — confirm the full path.
 *
 * @throws Exception any exception raised by {@code scheduler()} is propagated to the caller
 */
@PostMapping("/scheduler")
public void runScheduler() throws Exception {
preProcessScheduler.scheduler();
}
}
Original file line number Diff line number Diff line change
@@ -1,19 +1,11 @@
package com.sangchu.preprocess.etl.service;

import com.microsoft.playwright.*;
import com.sangchu.global.exception.custom.CustomException;
import com.sangchu.global.util.UtilFile;
import com.sangchu.global.util.statuscode.ApiStatus;
import io.github.bonigarcia.wdm.WebDriverManager;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.Alert;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Service;

import java.io.IOException;
Expand All @@ -23,11 +15,9 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipFile;

@Slf4j
Expand All @@ -45,41 +35,39 @@ public void crwalingCsvData() {
* 다운받은 압축 파일을 resources/data에 저장
*/
private void crwaling() {
WebDriverManager.chromedriver().setup();

Path resourcePath = Paths.get("src/main/resources/data").toAbsolutePath();
log.info("resourcePath = " + resourcePath);
log.info("resourcePath = {}", resourcePath);

UtilFile.resetDirectory(resourcePath);

Map<String, Object> chromePrefs = new HashMap<>();
chromePrefs.put("download.default_directory", resourcePath.toString());
chromePrefs.put("download.prompt_for_download", false);
chromePrefs.put("safebrowsing.enabled", true);

ChromeOptions options = new ChromeOptions();
options.setExperimentalOption("prefs", chromePrefs);
try (Playwright playwright = Playwright.create()) {
Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
.setHeadless(true));

WebDriver driver = new ChromeDriver(options);
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
.setAcceptDownloads(true));

try {
driver.get("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation");
Page page = context.newPage();
page.navigate("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation");
page.onDialog(Dialog::accept);

WebElement downloadBtn = driver.findElement(By.xpath("//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]"));
downloadBtn.click();
Thread.sleep(3000);
Download download = page.waitForDownload(() -> {
try {
page.click("xpath=//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]");
} catch (PlaywrightException e) {
throw new CustomException(ApiStatus._INTERNAL_SERVER_ERROR);
}
});

WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
Alert alert = wait.until(ExpectedConditions.alertIsPresent());
alert.accept();
Path tmpDownloadedFile = download.path();

waitForDownloadToComplete(30);
Path targetFile = resourcePath.resolve(download.suggestedFilename());
Files.copy(tmpDownloadedFile, targetFile, StandardCopyOption.REPLACE_EXISTING);

browser.close();
} catch (Exception e) {
log.error("크롤링 실패", e);
throw new CustomException(ApiStatus._FILE_DOWNLOAD_FAIL);
} finally {
driver.quit();
}
}

Expand Down Expand Up @@ -111,6 +99,7 @@ public void fileUnZip() {
Path downloadDir = resourcePath.resolve("data");

List<Charset> charsets = Arrays.asList(
Charset.forName("EUC-KR"),
Charset.forName("MS949"),
Charset.forName("CP949"),
StandardCharsets.UTF_8,
Expand All @@ -119,7 +108,7 @@ public void fileUnZip() {

try {
Files.walk(downloadDir)
.filter(path -> Files.isRegularFile(path) && path.toString().endsWith(".zip"))
.filter(path -> Files.isRegularFile(path) && isZipFile(path) && !path.toString().endsWith(".txt"))
.forEach(zipFilePath -> {
try {
// 압축 파일 내 CSV 파일명 캐릭터셋 검사
Expand All @@ -134,7 +123,11 @@ public void fileUnZip() {
ZipFile zipFile = new ZipFile(zipFilePath.toFile(), extractedCharset);

zipFile.entries().asIterator().forEachRemaining(zipEntry -> {
Path outputPath = downloadDir.resolve(zipEntry.getName());
String entryName = zipEntry.getName();

if (entryName.endsWith(".txt")) return;

Path outputPath = downloadDir.resolve(entryName);

try {
if (zipEntry.isDirectory()) {
Expand Down Expand Up @@ -166,4 +159,16 @@ public void fileUnZip() {
throw new CustomException(ApiStatus._FILE_UNZIP_FAILED);
}
}

/**
 * Reports whether the given file begins with the ZIP local-file-header signature
 * (bytes {@code 0x50 0x4B 0x03 0x04}, i.e. {@code 'P' 'K' 3 4}).
 *
 * <p>Only the first four bytes are inspected. A file shorter than four bytes, or one that
 * cannot be opened or read, is reported as not being a ZIP file; a read failure is logged
 * at WARN level rather than propagated.
 *
 * @param path file to probe
 * @return {@code true} if the first four bytes match the signature, {@code false} otherwise
 */
private boolean isZipFile(Path path) {
    try (InputStream is = Files.newInputStream(path)) {
        byte[] signature = new byte[4];
        // A single read(byte[]) may return fewer bytes than requested even before EOF;
        // readNBytes keeps reading until the buffer is full or the stream ends.
        if (is.readNBytes(signature, 0, signature.length) != signature.length) {
            return false;
        }
        return signature[0] == 'P' && signature[1] == 'K' && signature[2] == 3 && signature[3] == 4;
    } catch (IOException e) {
        log.warn("ZIP 파일 여부 확인 실패: {}", path.getFileName(), e);
        return false;
    }
}
}