Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch'
implementation 'org.springframework.ai:spring-ai-openai-spring-boot-starter'
implementation 'org.seleniumhq.selenium:selenium-java:4.20.0'
implementation 'io.github.bonigarcia:webdrivermanager:5.8.0'
implementation 'com.microsoft.playwright:playwright:1.37.0'
implementation 'org.seleniumhq.selenium:selenium-devtools-v135:4.31.0'
implementation 'com.opencsv:opencsv:5.9'

Expand Down
32 changes: 27 additions & 5 deletions infra/local/dockerfile_backend
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,43 @@ COPY . .
RUN gradle bootJar --no-daemon

# 2단계: 실행 스테이지
FROM eclipse-temurin:21-jdk
FROM ubuntu:20.04

# vi 설치 (옵션)
RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y wget gnupg2

RUN apt-get update && apt-get install -y locales \
&& locale-gen ko_KR.UTF-8 \
&& update-locale LANG=ko_KR.UTF-8

ENV LANG=ko_KR.UTF-8
ENV LANGUAGE=ko_KR:ko
ENV LC_ALL=ko_KR.UTF-8

RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
&& echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \
&& apt-get update \
&& apt-get install -y temurin-21-jdk

RUN apt-get install -y \
vim dos2unix curl \
libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \
libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \
libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

# 빌드 스테이지에서 생성된 jar 파일 복사
COPY --from=builder /build/build/libs/*.jar app.jar
COPY script/backendSetting.sh /app/wait-for-elasticsearch.sh
COPY /infra/local/script/backendSetting.sh /app/wait-for-elasticsearch.sh

RUN dos2unix /app/wait-for-elasticsearch.sh
RUN chmod +x /app/wait-for-elasticsearch.sh

RUN chmod 777 /app/app.jar
ARG PROFILE
ENV PROFILE=${PROFILE}

EXPOSE 8080

ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"]
ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"]
2 changes: 1 addition & 1 deletion infra/local/dockerfile_nginx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ COPY nginx_config/default.conf /etc/nginx/conf.d/default.conf
VOLUME ["/etc/letsencrypt", "/var/www/certbot"]

# 필요한 쉘 스크립트 복사
COPY script/nginxSetting.sh start-nginx.sh
COPY /infra/local/script/nginxSetting.sh start-nginx.sh
COPY nginx_config/config.conf /config.conf

# 쉘 스크립트에 실행 권한 부여
Expand Down
25 changes: 22 additions & 3 deletions infra/main_app/dockerfile_backend
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,29 @@ COPY . .
RUN gradle bootJar --no-daemon

# 2단계: 실행 스테이지
FROM eclipse-temurin:21-jdk
FROM ubuntu:20.04

# vi 설치 (옵션)
RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y wget gnupg2

RUN apt-get update && apt-get install -y locales \
&& locale-gen ko_KR.UTF-8 \
&& update-locale LANG=ko_KR.UTF-8

ENV LANG=ko_KR.UTF-8
ENV LANGUAGE=ko_KR:ko
ENV LC_ALL=ko_KR.UTF-8

RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \
&& echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \
&& apt-get update \
&& apt-get install -y temurin-21-jdk

RUN apt-get install -y \
vim dos2unix curl \
libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \
libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \
libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.sangchu.preprocess.etl.controller;

import com.sangchu.preprocess.scheduler.PreProcessScheduler;
import lombok.RequiredArgsConstructor;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
Expand All @@ -20,6 +21,7 @@ public class MysqlBatchController {

private final JobLauncher jobLauncher;
private final Job mysqlJob;
private final PreProcessScheduler preProcessScheduler;

@PostMapping("/import/mysql")
public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, JobExecutionAlreadyRunningException, JobParametersInvalidException, JobRestartException {
Expand All @@ -31,4 +33,9 @@ public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, Jo

return "csv -> mysql 작업";
}

/**
 * Triggers the pre-processing scheduler on demand.
 *
 * <p>Handles POST requests mapped to {@code /scheduler} (relative to any class-level
 * mapping, which is not visible in this excerpt) and simply delegates to
 * {@code preProcessScheduler.scheduler()}. No response body is returned.
 *
 * @throws Exception whatever the delegated {@code scheduler()} call throws; it is
 *                   propagated unchanged to the caller
 */
@PostMapping("/scheduler")
public void runScheduler() throws Exception {
preProcessScheduler.scheduler();
}
}
Original file line number Diff line number Diff line change
@@ -1,19 +1,11 @@
package com.sangchu.preprocess.etl.service;

import com.microsoft.playwright.*;
import com.sangchu.global.exception.custom.CustomException;
import com.sangchu.global.util.UtilFile;
import com.sangchu.global.util.statuscode.ApiStatus;
import io.github.bonigarcia.wdm.WebDriverManager;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.Alert;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.stereotype.Service;

import java.io.IOException;
Expand All @@ -23,11 +15,9 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Duration;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipFile;

@Slf4j
Expand All @@ -45,41 +35,39 @@ public void crwalingCsvData() {
* 다운받은 압축 파일을 resources/data에 저장
*/
private void crwaling() {
WebDriverManager.chromedriver().setup();

Path resourcePath = Paths.get("src/main/resources/data").toAbsolutePath();
log.info("resourcePath = " + resourcePath);
log.info("resourcePath = {}", resourcePath);

UtilFile.resetDirectory(resourcePath);

Map<String, Object> chromePrefs = new HashMap<>();
chromePrefs.put("download.default_directory", resourcePath.toString());
chromePrefs.put("download.prompt_for_download", false);
chromePrefs.put("safebrowsing.enabled", true);

ChromeOptions options = new ChromeOptions();
options.setExperimentalOption("prefs", chromePrefs);
try (Playwright playwright = Playwright.create()) {
Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
.setHeadless(true));

WebDriver driver = new ChromeDriver(options);
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
.setAcceptDownloads(true));

try {
driver.get("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation");
Page page = context.newPage();
page.navigate("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation");
page.onDialog(Dialog::accept);

WebElement downloadBtn = driver.findElement(By.xpath("//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]"));
downloadBtn.click();
Thread.sleep(3000);
Download download = page.waitForDownload(() -> {
try {
page.click("xpath=//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]");
} catch (PlaywrightException e) {
throw new CustomException(ApiStatus._INTERNAL_SERVER_ERROR);
}
});

WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
Alert alert = wait.until(ExpectedConditions.alertIsPresent());
alert.accept();
Path tmpDownloadedFile = download.path();

waitForDownloadToComplete(30);
Path targetFile = resourcePath.resolve(download.suggestedFilename());
Files.copy(tmpDownloadedFile, targetFile, StandardCopyOption.REPLACE_EXISTING);

browser.close();
} catch (Exception e) {
log.error("크롤링 실패", e);
throw new CustomException(ApiStatus._FILE_DOWNLOAD_FAIL);
} finally {
driver.quit();
}
}

Expand Down Expand Up @@ -111,6 +99,7 @@ public void fileUnZip() {
Path downloadDir = resourcePath.resolve("data");

List<Charset> charsets = Arrays.asList(
Charset.forName("EUC-KR"),
Charset.forName("MS949"),
Charset.forName("CP949"),
StandardCharsets.UTF_8,
Expand All @@ -119,7 +108,7 @@ public void fileUnZip() {

try {
Files.walk(downloadDir)
.filter(path -> Files.isRegularFile(path) && path.toString().endsWith(".zip"))
.filter(path -> Files.isRegularFile(path) && isZipFile(path) && !path.toString().endsWith(".txt"))
.forEach(zipFilePath -> {
try {
// 압축 파일 내 CSV 파일명 캐릭터셋 검사
Expand All @@ -134,7 +123,11 @@ public void fileUnZip() {
ZipFile zipFile = new ZipFile(zipFilePath.toFile(), extractedCharset);

zipFile.entries().asIterator().forEachRemaining(zipEntry -> {
Path outputPath = downloadDir.resolve(zipEntry.getName());
String entryName = zipEntry.getName();

if (entryName.endsWith(".txt")) return;

Path outputPath = downloadDir.resolve(entryName);

try {
if (zipEntry.isDirectory()) {
Expand Down Expand Up @@ -166,4 +159,16 @@ public void fileUnZip() {
throw new CustomException(ApiStatus._FILE_UNZIP_FAILED);
}
}

/**
 * Checks whether the file at {@code path} starts with the four-byte signature
 * {@code 'P' 'K' 0x03 0x04}.
 *
 * @param path file whose leading bytes are inspected
 * @return {@code true} if the first four bytes match the signature; {@code false} if
 *         the file is shorter than four bytes, starts with different bytes, or cannot
 *         be read (the I/O failure is logged at warn level and not rethrown)
 */
private boolean isZipFile(Path path) {
    try (InputStream is = Files.newInputStream(path)) {
        byte[] signature = new byte[4];
        // A single read(byte[]) is allowed to return fewer bytes than requested even
        // when more are available; readNBytes keeps reading until the buffer is full
        // or end-of-stream, so a short read cannot misclassify a valid archive.
        int bytesRead = is.readNBytes(signature, 0, signature.length);
        if (bytesRead == signature.length) {
            return signature[0] == 'P' && signature[1] == 'K' && signature[2] == 3 && signature[3] == 4;
        }
    } catch (IOException e) {
        log.warn("ZIP 파일 여부 확인 실패: {}", path.getFileName(), e);
    }
    return false;
}
}