diff --git a/build.gradle b/build.gradle index d69ff75..676379a 100644 --- a/build.gradle +++ b/build.gradle @@ -38,7 +38,7 @@ dependencies { implementation 'org.springframework.boot:spring-boot-starter-data-elasticsearch' implementation 'org.springframework.ai:spring-ai-openai-spring-boot-starter' implementation 'org.seleniumhq.selenium:selenium-java:4.20.0' - implementation 'io.github.bonigarcia:webdrivermanager:5.8.0' + implementation 'com.microsoft.playwright:playwright:1.37.0' implementation 'org.seleniumhq.selenium:selenium-devtools-v135:4.31.0' implementation 'com.opencsv:opencsv:5.9' diff --git a/infra/local/dockerfile_backend b/infra/local/dockerfile_backend index 6526d52..310ccb7 100644 --- a/infra/local/dockerfile_backend +++ b/infra/local/dockerfile_backend @@ -9,16 +9,38 @@ COPY . . RUN gradle bootJar --no-daemon # 2단계: 실행 스테이지 -FROM eclipse-temurin:21-jdk +FROM ubuntu:20.04 -# vi 설치 (옵션) -RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y wget gnupg2 + +RUN apt-get update && apt-get install -y locales \ + && locale-gen ko_KR.UTF-8 \ + && update-locale LANG=ko_KR.UTF-8 + +ENV LANG=ko_KR.UTF-8 +ENV LANGUAGE=ko_KR:ko +ENV LC_ALL=ko_KR.UTF-8 + +RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \ + && echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \ + && apt-get update \ + && apt-get install -y temurin-21-jdk + +RUN apt-get install -y \ + vim dos2unix curl \ + libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \ + libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \ + libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \ + && rm -rf /var/lib/apt/lists/* WORKDIR /app # 빌드 스테이지에서 생성된 jar 파일 복사 COPY --from=builder /build/build/libs/*.jar app.jar -COPY script/backendSetting.sh /app/wait-for-elasticsearch.sh 
+COPY /infra/local/script/backendSetting.sh /app/wait-for-elasticsearch.sh + +RUN dos2unix /app/wait-for-elasticsearch.sh +RUN chmod +x /app/wait-for-elasticsearch.sh RUN chmod 777 /app/app.jar ARG PROFILE @@ -26,4 +48,4 @@ ENV PROFILE=${PROFILE} EXPOSE 8080 -ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"] +ENTRYPOINT ["/bin/bash", "/app/wait-for-elasticsearch.sh"] \ No newline at end of file diff --git a/infra/local/dockerfile_nginx b/infra/local/dockerfile_nginx index 7a1a542..a8a192d 100644 --- a/infra/local/dockerfile_nginx +++ b/infra/local/dockerfile_nginx @@ -10,7 +10,7 @@ COPY nginx_config/default.conf /etc/nginx/conf.d/default.conf VOLUME ["/etc/letsencrypt", "/var/www/certbot"] # 필요한 쉘 스크립트 복사 -COPY script/nginxSetting.sh start-nginx.sh +COPY /infra/local/script/nginxSetting.sh start-nginx.sh COPY nginx_config/config.conf /config.conf # 쉘 스크립트에 실행 권한 부여 diff --git a/infra/main_app/dockerfile_backend b/infra/main_app/dockerfile_backend index 52471e5..77b5352 100644 --- a/infra/main_app/dockerfile_backend +++ b/infra/main_app/dockerfile_backend @@ -9,10 +9,29 @@ COPY . . 
RUN gradle bootJar --no-daemon # 2단계: 실행 스테이지 -FROM eclipse-temurin:21-jdk +FROM ubuntu:20.04 -# vi 설치 (옵션) -RUN apt-get update && apt-get install -y vim && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y wget gnupg2 + +RUN apt-get update && apt-get install -y locales \ + && locale-gen ko_KR.UTF-8 \ + && update-locale LANG=ko_KR.UTF-8 + +ENV LANG=ko_KR.UTF-8 +ENV LANGUAGE=ko_KR:ko +ENV LC_ALL=ko_KR.UTF-8 + +RUN wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - \ + && echo "deb https://packages.adoptium.net/artifactory/deb focal main" | tee /etc/apt/sources.list.d/adoptium.list \ + && apt-get update \ + && apt-get install -y temurin-21-jdk + +RUN apt-get install -y \ + vim dos2unix curl \ + libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libdbus-1-3 \ + libxcb1 libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 libxext6 libxfixes3 \ + libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \ + && rm -rf /var/lib/apt/lists/* WORKDIR /app diff --git a/infra/main_app/nginx_config/config.conf b/infra/main_app/nginx_config/config.conf index 8c3b4b1..ec2d940 100644 --- a/infra/main_app/nginx_config/config.conf +++ b/infra/main_app/nginx_config/config.conf @@ -18,16 +18,31 @@ server { proxy_read_timeout 900s; } - location /embed { - proxy_pass http://embedding:5000/embed; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } + location /embed { + proxy_pass http://embedding:5000/embed; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + 
proxy_set_header X-Forwarded-Proto $scheme; + + proxy_read_timeout 900s; + } + + location /embed/ { + proxy_pass http://embedding:5000/embed/; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_read_timeout 900s; + } location / { proxy_pass http://frontend:80/; diff --git a/src/main/java/com/sangchu/preprocess/etl/controller/MysqlBatchController.java b/src/main/java/com/sangchu/preprocess/etl/controller/MysqlBatchController.java index 157b25f..b806b1c 100644 --- a/src/main/java/com/sangchu/preprocess/etl/controller/MysqlBatchController.java +++ b/src/main/java/com/sangchu/preprocess/etl/controller/MysqlBatchController.java @@ -1,5 +1,6 @@ package com.sangchu.preprocess.etl.controller; +import com.sangchu.preprocess.scheduler.PreProcessScheduler; import lombok.RequiredArgsConstructor; import org.springframework.batch.core.Job; import org.springframework.batch.core.JobParameters; @@ -20,6 +21,7 @@ public class MysqlBatchController { private final JobLauncher jobLauncher; private final Job mysqlJob; + private final PreProcessScheduler preProcessScheduler; @PostMapping("/import/mysql") public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, JobExecutionAlreadyRunningException, JobParametersInvalidException, JobRestartException { @@ -31,4 +33,9 @@ public String runImportMysqlJob() throws JobInstanceAlreadyCompleteException, Jo return "csv -> mysql 작업"; } + + @PostMapping("/scheduler") + public void runScheduler() throws Exception { + preProcessScheduler.scheduler(); + } } \ No newline at end of file diff --git a/src/main/java/com/sangchu/preprocess/etl/service/CrawlerService.java b/src/main/java/com/sangchu/preprocess/etl/service/CrawlerService.java index b649c3a..52e87fc 100644 --- 
a/src/main/java/com/sangchu/preprocess/etl/service/CrawlerService.java +++ b/src/main/java/com/sangchu/preprocess/etl/service/CrawlerService.java @@ -1,19 +1,11 @@ package com.sangchu.preprocess.etl.service; +import com.microsoft.playwright.*; import com.sangchu.global.exception.custom.CustomException; import com.sangchu.global.util.UtilFile; import com.sangchu.global.util.statuscode.ApiStatus; -import io.github.bonigarcia.wdm.WebDriverManager; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.openqa.selenium.Alert; -import org.openqa.selenium.By; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.chrome.ChromeOptions; -import org.openqa.selenium.support.ui.ExpectedConditions; -import org.openqa.selenium.support.ui.WebDriverWait; import org.springframework.stereotype.Service; import java.io.IOException; @@ -23,11 +15,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.time.Duration; +import java.nio.file.StandardCopyOption; import java.util.Arrays; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.zip.ZipFile; @Slf4j @@ -45,41 +35,39 @@ public void crwalingCsvData() { * 다운받은 압축 파일을 resources/data에 저장 */ private void crwaling() { - WebDriverManager.chromedriver().setup(); - Path resourcePath = Paths.get("src/main/resources/data").toAbsolutePath(); - log.info("resourcePath = " + resourcePath); + log.info("resourcePath = {}", resourcePath); UtilFile.resetDirectory(resourcePath); - Map chromePrefs = new HashMap<>(); - chromePrefs.put("download.default_directory", resourcePath.toString()); - chromePrefs.put("download.prompt_for_download", false); - chromePrefs.put("safebrowsing.enabled", true); - - ChromeOptions options = new ChromeOptions(); - options.setExperimentalOption("prefs", chromePrefs); + try (Playwright playwright = 
Playwright.create()) { + Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions() + .setHeadless(true)); - WebDriver driver = new ChromeDriver(options); + BrowserContext context = browser.newContext(new Browser.NewContextOptions() + .setAcceptDownloads(true)); - try { - driver.get("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation"); + Page page = context.newPage(); + page.navigate("https://www.data.go.kr/data/15083033/fileData.do#/layer_data_infomation"); + page.onDialog(Dialog::accept); - WebElement downloadBtn = driver.findElement(By.xpath("//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]")); - downloadBtn.click(); - Thread.sleep(3000); + Download download = page.waitForDownload(() -> { + try { + page.click("xpath=//a[contains(@onclick, \"fn_fileDataDown('15083033'\")]"); + } catch (PlaywrightException e) { + throw new CustomException(ApiStatus._INTERNAL_SERVER_ERROR); + } + }); - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); - Alert alert = wait.until(ExpectedConditions.alertIsPresent()); - alert.accept(); + Path tmpDownloadedFile = download.path(); - waitForDownloadToComplete(30); + Path targetFile = resourcePath.resolve(download.suggestedFilename()); + Files.copy(tmpDownloadedFile, targetFile, StandardCopyOption.REPLACE_EXISTING); + browser.close(); } catch (Exception e) { log.error("크롤링 실패", e); throw new CustomException(ApiStatus._FILE_DOWNLOAD_FAIL); - } finally { - driver.quit(); } } @@ -111,6 +99,7 @@ public void fileUnZip() { Path downloadDir = resourcePath.resolve("data"); List charsets = Arrays.asList( + Charset.forName("EUC-KR"), Charset.forName("MS949"), Charset.forName("CP949"), StandardCharsets.UTF_8, @@ -119,7 +108,7 @@ public void fileUnZip() { try { Files.walk(downloadDir) - .filter(path -> Files.isRegularFile(path) && path.toString().endsWith(".zip")) + .filter(path -> Files.isRegularFile(path) && isZipFile(path) && !path.toString().endsWith(".txt")) 
.forEach(zipFilePath -> { try { // 압축 파일 내 CSV 파일명 캐릭터셋 검사 @@ -134,7 +123,11 @@ public void fileUnZip() { ZipFile zipFile = new ZipFile(zipFilePath.toFile(), extractedCharset); zipFile.entries().asIterator().forEachRemaining(zipEntry -> { - Path outputPath = downloadDir.resolve(zipEntry.getName()); + String entryName = zipEntry.getName(); + + if (entryName.endsWith(".txt")) return; + + Path outputPath = downloadDir.resolve(entryName); try { if (zipEntry.isDirectory()) { @@ -166,4 +159,21 @@ public void fileUnZip() { throw new CustomException(ApiStatus._FILE_UNZIP_FAILED); } } + + /** + * Returns whether the file at {@code path} begins with the four bytes 'P', 'K', 3, 4. + * Reads with readNBytes because a single read(byte[]) call may return fewer bytes than requested. + * Returns false when fewer than four bytes are available or when reading fails (a warning is logged). + */ + private boolean isZipFile(Path path) { + try (InputStream is = Files.newInputStream(path)) { + byte[] signature = new byte[4]; + if (is.readNBytes(signature, 0, signature.length) == signature.length) { + return signature[0] == 'P' && signature[1] == 'K' && signature[2] == 3 && signature[3] == 4; + } + } catch (IOException e) { + log.warn("ZIP 파일 여부 확인 실패: {}", path.getFileName(), e); + } + return false; + } } \ No newline at end of file