diff --git a/Dockerfile b/Dockerfile
index 002ebf2..1c52dfa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,12 @@
 FROM node:20-alpine
 
+# Install Chromium for Puppeteer
+RUN apk add --no-cache chromium nss freetype harfbuzz ca-certificates ttf-freefont
+
+# Puppeteer environment variables
+ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
+ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
+
 WORKDIR /app
 
 COPY package.json package-lock.json* ./
diff --git a/package.json b/package.json
index 5b02436..7e1d04f 100644
--- a/package.json
+++ b/package.json
@@ -18,6 +18,7 @@
     "dotenv": "^16.4.0",
     "https-proxy-agent": "^7.0.0",
     "cookie-parser": "^1.4.6",
-    "iconv-lite": "^0.6.3"
+    "iconv-lite": "^0.6.3",
+    "puppeteer-core": "^22.0.0"
   }
 }
diff --git a/src/routes/api.js b/src/routes/api.js
index 0fedb16..c3370a7 100644
--- a/src/routes/api.js
+++ b/src/routes/api.js
@@ -78,8 +78,43 @@ router.delete('/sites/:id', async (req, res) => {
 
 router.post('/fetch-page', async (req, res) => {
   try {
-    const { url } = req.body;
+    const { url, browser, wait, login } = req.body;
     if (!url) return res.status(400).json({ error: 'URL is required' });
+
+    // Browser mode: render the page's JS with Puppeteer, then return the HTML
+    if (browser) {
+      const puppeteer = require('puppeteer-core');
+      // page.waitForTimeout was removed in Puppeteer v22, so delays use a timer promise.
+      const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
+      const b = await puppeteer.launch({
+        executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium-browser',
+        headless: true,
+        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+      });
+      try {
+        const page = await b.newPage();
+        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
+        await page.setViewport({ width: 1280, height: 800 });
+
+        // Login handling: run the scripted steps before loading the target page
+        if (login && login.steps) {
+          await page.goto(login.url || url, { waitUntil: 'networkidle2', timeout: 30000 });
+          for (const step of login.steps) {
+            if (step.action === 'type') { await page.waitForSelector(step.selector, {timeout:10000}); await page.type(step.selector, step.value, {delay:50}); }
+            else if (step.action === 'click') { await page.waitForSelector(step.selector, {timeout:10000}); await page.click(step.selector); }
+            else if (step.wait) { await sleep(step.wait); }
+          }
+          if (login.url && login.url !== url) await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+        } else {
+          await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+        }
+
+        await sleep(wait || 3000);
+        const html = await page.content();
+        await b.close();
+        return res.json({ html, finalUrl: url });
+      } catch (e) { await b.close(); throw e; }
+    }
     const axios = require('axios');
     const https = require('https');
     const iconv = require('iconv-lite');
diff --git a/src/services/crawler.js b/src/services/crawler.js
index d936c35..0c51284 100644
--- a/src/services/crawler.js
+++ b/src/services/crawler.js
@@ -4,6 +4,73 @@ const https = require('https');
 const iconv = require('iconv-lite');
 const db = require('../db');
 
+/**
+ * Fetch a page with a Puppeteer-driven browser (JS rendering + login support).
+ * Used when parse_rules.browser = true.
+ *
+ * Delays use a timer promise because page.waitForTimeout was removed in
+ * Puppeteer v22, the version range this change adds to package.json.
+ *
+ * @param {string} url - Page to fetch.
+ * @param {object} parseRules - Reads `login`, `wait` and `wait_for`.
+ * @returns {Promise<string>} HTML of the page after rendering.
+ */
+async function fetchWithBrowser(url, parseRules) {
+  const puppeteer = require('puppeteer-core');
+  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
+  const browser = await puppeteer.launch({
+    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium-browser',
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+  });
+
+  try {
+    const page = await browser.newPage();
+    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
+    await page.setViewport({ width: 1280, height: 800 });
+
+    // When a login is required, run the scripted steps first
+    if (parseRules.login) {
+      const login = parseRules.login;
+      await page.goto(login.url || url, { waitUntil: 'networkidle2', timeout: 30000 });
+      if (login.wait_before) await sleep(login.wait_before);
+
+      for (const step of (login.steps || [])) {
+        if (step.action === 'type' && step.selector && step.value) {
+          await page.waitForSelector(step.selector, { timeout: 10000 });
+          await page.type(step.selector, step.value, { delay: 50 });
+        } else if (step.action === 'click' && step.selector) {
+          await page.waitForSelector(step.selector, { timeout: 10000 });
+          await page.click(step.selector);
+        } else if (step.wait) {
+          await sleep(step.wait);
+        }
+      }
+
+      // After logging in, navigate to the target page (when it differs from login.url)
+      if (login.url && login.url !== url) {
+        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+      }
+    } else {
+      await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+    }
+
+    // Give client-side JS time to render
+    const waitMs = parseRules.wait || 3000;
+    await sleep(waitMs);
+
+    // Wait for a specific selector to appear (best effort: a timeout is ignored)
+    if (parseRules.wait_for) {
+      await page.waitForSelector(parseRules.wait_for, { timeout: 15000 }).catch(() => {});
+    }
+
+    const html = await page.content();
+    return html;
+  } finally {
+    await browser.close();
+  }
+}
+
 // SSL 인증서 무시 (자체 서명 등)
 const axiosInstance = axios.create({
   httpsAgent: new https.Agent({ rejectUnauthorized: false }),
@@ -76,9 +143,15 @@ async function crawlSite(siteId) {
       }
     }
 
-    // 2. HTML 가져오기 (인코딩 자동 감지)
-    const response = await axiosInstance.get(targetUrl);
-    const rawHtml = decodeResponse(response.data, response.headers['content-type']);
+    // 2. Fetch HTML (browser mode or static)
+    let rawHtml;
+    if (parseRules.browser) {
+      await logCrawl(siteId, 'browser', `브라우저 모드 크롤링 (wait: ${parseRules.wait || 3000}ms)`);
+      rawHtml = await fetchWithBrowser(targetUrl, parseRules);
+    } else {
+      const response = await axiosInstance.get(targetUrl);
+      rawHtml = decodeResponse(response.data, response.headers['content-type']);
+    }
 
     // 3. 파싱 규칙에 따라 데이터 추출
     const parsedData = parseHtml(rawHtml, parseRules);
diff --git a/views/admin/mapper.ejs b/views/admin/mapper.ejs
index 2a05edd..6bf5c4a 100644
--- a/views/admin/mapper.ejs
+++ b/views/admin/mapper.ejs
@@ -26,6 +26,9 @@