자바 스프링부트 웹 크롤링 구현

자바 스프링부트 웹 크롤링 구현 | Java | 2025. 4. 15. 15:45 | @위피M

Table of Contents

1. 필요한 의존성 (build.gradle 또는 pom.xml에 추가)

dependencies {
    implementation 'org.springframework.boot:spring-boot-starter-web'
    implementation 'org.jsoup:jsoup:1.15.3'            // HTML parsing
    implementation 'org.seleniumhq.selenium:selenium-java:4.8.1'  // crawling of dynamic (JavaScript-rendered) sites
    implementation 'io.github.bonigarcia:webdrivermanager:5.3.2'  // WebDriver management

    // Lombok (boilerplate reduction) is a compile-time annotation processor.
    // Gradle only runs processors declared on the annotationProcessor configuration,
    // and the jar is not needed at runtime, so it is compileOnly rather than implementation.
    compileOnly 'org.projectlombok:lombok'
    annotationProcessor 'org.projectlombok:lombok'

    testImplementation 'org.springframework.boot:spring-boot-starter-test'
}

2. 크롤링할 사이트 정보를 담을 모델 클래스

package com.example.webcrawler.model;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Holds the data captured from one crawled page.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}/{@code toString} and both the
 * no-argument and all-arguments constructors are generated by the Lombok
 * annotations on this class.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CrawledData {
    /** Address of the page that was crawled. */
    private String url;
    /** Title of the page. */
    private String title;
    /** Text extracted from the page. */
    private String content;
    /** Time of the crawl, stored as a string. */
    private String timestamp;
}

3. 크롤링 서비스 인터페이스

package com.example.webcrawler.service;

import com.example.webcrawler.model.CrawledData;
import java.util.List;

/**
 * Contract shared by the crawler implementations.
 */
public interface CrawlerService {
    /**
     * Crawls the page at the given URL.
     *
     * @param url address of the page to crawl
     * @return the crawled data as a list
     */
    List<CrawledData> crawlWebsite(String url);
}

4. JSoup을 이용한 정적 웹사이트 크롤링 구현

package com.example.webcrawler.service.impl;

import com.example.webcrawler.model.CrawledData;
import com.example.webcrawler.service.CrawlerService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;

/**
 * {@link CrawlerService} for static pages: downloads the HTML with Jsoup and reads
 * the document title plus the text of every {@code <p>} element.
 */
@Service
public class JsoupCrawlerService implements CrawlerService {

    // Fully qualified so this class needs no additional import lines.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(JsoupCrawlerService.class.getName());

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36";

    /** Connection/read timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches {@code url} and extracts its title and paragraph text.
     *
     * <p>This is a best-effort operation: any failure to fetch or parse the page is
     * logged and reported as an empty list rather than as an exception.
     *
     * @param url address of the page to crawl
     * @return a list holding one {@link CrawledData} on success, or an empty list when
     *         the page could not be fetched (I/O error, or a null, empty or malformed URL)
     */
    @Override
    public List<CrawledData> crawlWebsite(String url) {
        List<CrawledData> result = new ArrayList<>();

        try {
            // Connect to the page and download its HTML
            Document document = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS)
                    .get();

            // Page title
            String title = document.title();

            // Body text (here: every <p> element, one per line)
            Elements paragraphs = document.select("p");
            StringBuilder content = new StringBuilder();
            paragraphs.forEach(p -> content.append(p.text()).append("\n"));

            // The crawl time is used as the timestamp
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);

            result.add(new CrawledData(url, title, content.toString(), timestamp));

        } catch (IOException | IllegalArgumentException e) {
            // Jsoup rejects null, empty or malformed URLs with an unchecked
            // IllegalArgumentException; handle it like an I/O failure so that
            // callers always receive a (possibly empty) list.
            LOGGER.log(java.util.logging.Level.WARNING, "Failed to crawl " + url, e);
        }

        return result;
    }
}

5. Selenium을 이용한 동적 웹사이트 크롤링 구현

package com.example.webcrawler.service.impl;

import com.example.webcrawler.model.CrawledData;
import com.example.webcrawler.service.CrawlerService;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.stereotype.Service;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;

/**
 * {@link CrawlerService} for dynamic pages: loads the page in headless Chrome through
 * Selenium and reads the title plus the text of every {@code <p>} element.
 */
@Service
public class SeleniumCrawlerService implements CrawlerService {

    // Fully qualified so this class needs no additional import lines.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(SeleniumCrawlerService.class.getName());

    /** Fixed pause after navigation that gives the page time to render; tune as needed. */
    private static final long PAGE_LOAD_WAIT_MILLIS = 2000;

    /**
     * Opens {@code url} in a headless browser and extracts its title and paragraph text.
     *
     * <p>This is a best-effort operation: any failure (driver setup, navigation, element
     * lookup, interruption) is logged and reported as an empty list. The browser is
     * always shut down before returning. If the calling thread is interrupted while
     * waiting for the page, its interrupt flag is restored.
     *
     * @param url address of the page to crawl
     * @return a list holding one {@link CrawledData} on success, otherwise an empty list
     */
    @Override
    public List<CrawledData> crawlWebsite(String url) {
        List<CrawledData> result = new ArrayList<>();

        WebDriver driver = null;

        try {
            // WebDriver setup. Done inside the try block so that a failure to resolve
            // the driver binary is handled like every other crawl failure.
            WebDriverManager.chromedriver().setup();
            ChromeOptions options = new ChromeOptions();
            options.addArguments("--headless");  // run without a browser window
            options.addArguments("--disable-gpu");
            options.addArguments("--no-sandbox");

            driver = new ChromeDriver(options);
            driver.get(url);

            // Wait for the page to finish rendering
            Thread.sleep(PAGE_LOAD_WAIT_MILLIS);

            // Page title
            String title = driver.getTitle();

            // Body text (here: every <p> element, one per line)
            StringBuilder content = new StringBuilder();
            List<WebElement> paragraphs = driver.findElements(By.tagName("p"));
            for (WebElement paragraph : paragraphs) {
                content.append(paragraph.getText()).append("\n");
            }

            // The crawl time is used as the timestamp
            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME);

            result.add(new CrawledData(url, title, content.toString(), timestamp));

        } catch (InterruptedException e) {
            // Restore the flag so the owning executor can still observe the interruption.
            Thread.currentThread().interrupt();
            LOGGER.log(java.util.logging.Level.WARNING, "Interrupted while crawling " + url, e);
        } catch (RuntimeException e) {
            LOGGER.log(java.util.logging.Level.WARNING, "Failed to crawl " + url, e);
        } finally {
            if (driver != null) {
                driver.quit();
            }
        }

        return result;
    }
}

6. 크롤링 팩토리 서비스 (정적/동적 사이트 모두 처리)

package com.example.webcrawler.service;

import com.example.webcrawler.service.impl.JsoupCrawlerService;
import com.example.webcrawler.service.impl.SeleniumCrawlerService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * Hands out the crawler implementation that matches a site's type:
 * the Selenium-based crawler for dynamic sites, the Jsoup-based one otherwise.
 */
@Service
public class CrawlerFactory {

    private final JsoupCrawlerService staticCrawler;
    private final SeleniumCrawlerService dynamicCrawler;

    /**
     * Creates the factory from the two available crawler implementations.
     *
     * @param jsoupCrawlerService    crawler returned for static sites
     * @param seleniumCrawlerService crawler returned for dynamic sites
     */
    @Autowired
    public CrawlerFactory(JsoupCrawlerService jsoupCrawlerService, SeleniumCrawlerService seleniumCrawlerService) {
        this.staticCrawler = jsoupCrawlerService;
        this.dynamicCrawler = seleniumCrawlerService;
    }

    /**
     * Picks the crawler for a site.
     *
     * @param isDynamicSite {@code true} to get the Selenium crawler, {@code false} for Jsoup
     * @return the matching crawler service
     */
    public CrawlerService getCrawlerService(boolean isDynamicSite) {
        return isDynamicSite ? dynamicCrawler : staticCrawler;
    }
}

7. 컨트롤러 구현

package com.example.webcrawler.controller;

import com.example.webcrawler.model.CrawledData;
import com.example.webcrawler.service.CrawlerFactory;
import com.example.webcrawler.service.CrawlerService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;

import java.util.List;

/**
 * REST endpoint that crawls a caller-supplied URL on demand.
 */
@RestController
@RequestMapping("/api/crawler")
public class CrawlerController {

    private static final String HTTP_PREFIX = "http://";
    private static final String HTTPS_PREFIX = "https://";

    private final CrawlerFactory crawlerFactory;

    @Autowired
    public CrawlerController(CrawlerFactory crawlerFactory) {
        this.crawlerFactory = crawlerFactory;
    }

    /**
     * Crawls {@code url} with the crawler selected by {@code isDynamicSite}.
     *
     * @param url           address of the page to crawl; must start with {@code http://} or {@code https://}
     * @param isDynamicSite {@code true} to use the dynamic-site crawler; defaults to {@code false}
     * @return 200 with the crawled data, or 400 with no body when {@code url} is not an http(s) URL
     */
    @GetMapping("/crawl")
    public ResponseEntity<List<CrawledData>> crawlWebsite(
            @RequestParam String url,
            @RequestParam(defaultValue = "false") boolean isDynamicSite) {

        // The URL comes straight from the client and is fetched by this server, so
        // anything other than plain web URLs is refused (a browser-based crawler
        // would otherwise also open file: and similar schemes).
        // NOTE(review): this check does not stop requests to internal hosts
        // (localhost, private networks); add a host allow/deny list if this
        // endpoint is reachable by untrusted clients.
        if (!isHttpUrl(url)) {
            return ResponseEntity.badRequest().build();
        }

        CrawlerService crawlerService = crawlerFactory.getCrawlerService(isDynamicSite);
        List<CrawledData> crawledData = crawlerService.crawlWebsite(url);

        return ResponseEntity.ok(crawledData);
    }

    /** Returns whether {@code url} starts with the http or https scheme (case-insensitive). */
    private static boolean isHttpUrl(String url) {
        if (url == null) {
            return false;
        }
        String candidate = url.trim();
        return candidate.regionMatches(true, 0, HTTP_PREFIX, 0, HTTP_PREFIX.length())
                || candidate.regionMatches(true, 0, HTTPS_PREFIX, 0, HTTPS_PREFIX.length());
    }
}

8. 스케줄링을 통한 자동 크롤링 서비스

package com.example.webcrawler.service;

import com.example.webcrawler.model.CrawledData;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Crawls a fixed set of sites on a schedule and keeps the results of the most
 * recent run in memory.
 */
@Service
public class ScheduledCrawlerService {

    // Fully qualified so this class needs no additional import lines.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(ScheduledCrawlerService.class.getName());

    private final CrawlerFactory crawlerFactory;

    // Sites to crawl: URL -> whether the site is dynamic
    private final Map<String, Boolean> targetSites = new HashMap<>();

    // Results of the last completed run. The list is replaced as a whole after each
    // run and never modified once published, so readers on other threads always see
    // a complete result set instead of a half-filled one.
    private volatile List<CrawledData> crawlingResults = new ArrayList<>();

    @Autowired
    public ScheduledCrawlerService(CrawlerFactory crawlerFactory) {
        this.crawlerFactory = crawlerFactory;

        // Register the sites to crawl (URL, dynamic or not)
        targetSites.put("https://example.com", false);
        targetSites.put("https://dynamic-site-example.com", true);
        // More sites can be added here
    }

    /**
     * Crawls every registered site and publishes the collected data as the new
     * result set. A site whose crawl throws is logged and skipped so that the
     * remaining sites are still processed.
     */
    @Scheduled(fixedRate = 3600000) // runs every hour
    public void scheduledCrawling() {
        // Collect into a private list; the previous results stay readable until
        // this run has finished.
        List<CrawledData> freshResults = new ArrayList<>();

        for (Map.Entry<String, Boolean> site : targetSites.entrySet()) {
            String url = site.getKey();
            boolean isDynamic = site.getValue();

            try {
                CrawlerService crawler = crawlerFactory.getCrawlerService(isDynamic);
                freshResults.addAll(crawler.crawlWebsite(url));
            } catch (RuntimeException e) {
                LOGGER.log(java.util.logging.Level.WARNING, "Scheduled crawl failed for " + url, e);
            }
        }

        // Publish the finished run in one step
        crawlingResults = freshResults;

        // The results can be stored in a database or processed further from here
        saveOrProcessResults(freshResults);
    }

    /**
     * Hook for persisting or post-processing a finished run; currently only
     * prints how many items were collected.
     *
     * @param results data collected by the run
     */
    private void saveOrProcessResults(List<CrawledData> results) {
        // Database storage or additional processing logic goes here
        System.out.println("크롤링 완료: " + results.size() + "개의 데이터가 수집되었습니다.");
    }

    /**
     * Returns the results of the most recently completed run.
     *
     * @return a new list containing the latest results; empty before the first run completes
     */
    public List<CrawledData> getCrawlingResults() {
        return new ArrayList<>(crawlingResults);
    }
}

9. 메인 애플리케이션 클래스

package com.example.webcrawler;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Entry point of the web crawler application. Scheduling support is switched on
 * here through {@code @EnableScheduling}.
 */
@SpringBootApplication
@EnableScheduling
public class WebCrawlerApplication {

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments handed to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(WebCrawlerApplication.class);
        application.run(args);
    }
}

10. 설정 예시 (application.properties)

# Application settings
server.port=8080

# Crawling thread pool settings (size of the task-scheduling pool)
spring.task.scheduling.pool.size=5

# Logging settings
logging.level.com.example.webcrawler=INFO

추가 기능 확장 방법

데이터베이스 저장: JPA를 사용하여 크롤링한 데이터를 데이터베이스에 저장
에러 처리 개선: 예외 처리 및 재시도 메커니즘 구현
프록시 추가: 차단 방지를 위한 프록시 순환 기능
이미지/파일 다운로드: 크롤링 중 발견된 이미지나 파일을 함께 저장
사이트맵 기반 크롤링: XML 사이트맵을 활용한 효율적인 크롤링

이 코드는 기본적인 웹 크롤링 기능을 제공하며, 필요에 따라 확장할 수 있습니다. 다양한 웹사이트를 효과적으로 크롤링할 수 있는 기본 구조를 갖추고 있습니다.

저작자표시 (새창열림)

@위피M :: ChatGPT로 여는 새로운 세상!!

ChatGPT, 블록체인, 자바, 맥북, 인터넷, 컴퓨터 정보를 공유합니다.

포스팅이 좋았다면 "좋아요❤️" 또는 "구독👍🏻" 해주세요!