diff --git a/Dockerfile b/Dockerfile
index 002ebf2..1c52dfa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,12 @@
 FROM node:20-alpine
 
+# Install Chromium plus the libraries and fonts it needs, for Puppeteer
+RUN apk add --no-cache chromium nss freetype harfbuzz ca-certificates ttf-freefont
+
+# Puppeteer environment: use the system Chromium installed above
+ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
+ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
+
 WORKDIR /app
 
 COPY package.json package-lock.json* ./
diff --git a/package.json b/package.json
index 5b02436..7e1d04f 100644
--- a/package.json
+++ b/package.json
@@ -18,6 +18,7 @@
     "dotenv": "^16.4.0",
     "https-proxy-agent": "^7.0.0",
     "cookie-parser": "^1.4.6",
-    "iconv-lite": "^0.6.3"
+    "iconv-lite": "^0.6.3",
+    "puppeteer-core": "^22.0.0"
   }
 }
diff --git a/src/routes/api.js b/src/routes/api.js
index 0fedb16..c3370a7 100644
--- a/src/routes/api.js
+++ b/src/routes/api.js
@@ -78,8 +78,63 @@ router.delete('/sites/:id', async (req, res) => {
 
 router.post('/fetch-page', async (req, res) => {
   try {
-    const { url } = req.body;
+    const { url, browser, wait, login } = req.body;
     if (!url) return res.status(400).json({ error: 'URL is required' });
+
+    // Browser mode: render the page (including JS) with Puppeteer and return the HTML.
+    if (browser) {
+      // Chromium will open file:// and other local schemes, so only accept web URLs.
+      const isWebUrl = (u) => /^https?:\/\//i.test(String(u));
+      if (!isWebUrl(url) || (login && login.url && !isWebUrl(login.url))) {
+        return res.status(400).json({ error: 'Only http(s) URLs are allowed' });
+      }
+      // page.waitForTimeout() no longer exists in Puppeteer 22, so sleep with a timer.
+      // Delays come from the request body: coerce to a number and cap them at 30s.
+      const sleep = (ms) => new Promise((resolve) => {
+        setTimeout(resolve, Math.min(Math.max(Number(ms) || 0, 0), 30000));
+      });
+
+      const puppeteer = require('puppeteer-core');
+      const b = await puppeteer.launch({
+        executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium-browser',
+        headless: true,
+        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+      });
+      try {
+        const page = await b.newPage();
+        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
+        await page.setViewport({ width: 1280, height: 800 });
+
+        // Optional login: open the login page, replay the steps, then go to the target.
+        if (login && Array.isArray(login.steps)) {
+          await page.goto(login.url || url, { waitUntil: 'networkidle2', timeout: 30000 });
+          for (const step of login.steps) {
+            if (step.action === 'type') {
+              await page.waitForSelector(step.selector, { timeout: 10000 });
+              await page.type(step.selector, step.value, { delay: 50 });
+            } else if (step.action === 'click') {
+              await page.waitForSelector(step.selector, { timeout: 10000 });
+              await page.click(step.selector);
+            } else if (step.wait) {
+              await sleep(step.wait);
+            }
+          }
+          if (login.url && login.url !== url) {
+            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+          }
+        } else {
+          await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+        }
+
+        await sleep(wait || 3000);
+        const html = await page.content();
+        // Report where the browser actually ended up (redirects, post-login landing page).
+        return res.json({ html, finalUrl: page.url() });
+      } finally {
+        // Always shut Chromium down, on success and on failure, so processes do not pile up.
+        await b.close();
+      }
+    }
     const axios = require('axios');
     const https = require('https');
     const iconv = require('iconv-lite');
diff --git a/src/services/crawler.js b/src/services/crawler.js
index d936c35..0c51284 100644
--- a/src/services/crawler.js
+++ b/src/services/crawler.js
@@ -4,6 +4,75 @@ const https = require('https');
 const iconv = require('iconv-lite');
 const db = require('../db');
 
+/**
+ * Resolve after `ms` milliseconds.
+ * Stand-in for page.waitForTimeout(), which Puppeteer 22 removed.
+ */
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Fetch a page through headless Chromium (JS rendering + optional login).
+ * Used by crawlSite when parse_rules.browser is true.
+ *
+ * @param {string} url - Page whose rendered HTML is wanted.
+ * @param {object} parseRules - Site parse rules; reads `login`, `wait` and `wait_for`.
+ * @returns {Promise<string>} Serialized HTML of the rendered page.
+ */
+async function fetchWithBrowser(url, parseRules) {
+  const puppeteer = require('puppeteer-core');
+  const browser = await puppeteer.launch({
+    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium-browser',
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
+  });
+
+  try {
+    const page = await browser.newPage();
+    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
+    await page.setViewport({ width: 1280, height: 800 });
+
+    // Optional login flow
+    if (parseRules.login) {
+      const login = parseRules.login;
+      await page.goto(login.url || url, { waitUntil: 'networkidle2', timeout: 30000 });
+      if (login.wait_before) await sleep(login.wait_before);
+
+      for (const step of (login.steps || [])) {
+        if (step.action === 'type' && step.selector && step.value) {
+          await page.waitForSelector(step.selector, { timeout: 10000 });
+          await page.type(step.selector, step.value, { delay: 50 });
+        } else if (step.action === 'click' && step.selector) {
+          await page.waitForSelector(step.selector, { timeout: 10000 });
+          await page.click(step.selector);
+        } else if (step.wait) {
+          await sleep(step.wait);
+        }
+      }
+
+      // After logging in, move on to the target page if it differs from the login page
+      if (login.url && login.url !== url) {
+        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+      }
+    } else {
+      await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+    }
+
+    // Give client-side scripts time to render
+    await sleep(parseRules.wait || 3000);
+
+    // Best effort: also wait for a specific selector; a timeout here is deliberately not fatal
+    if (parseRules.wait_for) {
+      await page.waitForSelector(parseRules.wait_for, { timeout: 15000 }).catch(() => {});
+    }
+
+    return await page.content();
+  } finally {
+    await browser.close();
+  }
+}
+
 // SSL 인증서 무시 (자체 서명 등)
 const axiosInstance = axios.create({
   httpsAgent: new https.Agent({ rejectUnauthorized: false }),
@@ -76,9 +145,15 @@ async function crawlSite(siteId) {
     }
   }
 
-  // 2. HTML 가져오기 (인코딩 자동 감지)
-  const response = await axiosInstance.get(targetUrl);
-  const rawHtml = decodeResponse(response.data, response.headers['content-type']);
+  // 2. Fetch the HTML (browser mode or static)
+  let rawHtml;
+  if (parseRules.browser) {
+    await logCrawl(siteId, 'browser', `브라우저 모드 크롤링 (wait: ${parseRules.wait || 3000}ms)`);
+    rawHtml = await fetchWithBrowser(targetUrl, parseRules);
+  } else {
+    const response = await axiosInstance.get(targetUrl);
+    rawHtml = decodeResponse(response.data, response.headers['content-type']);
+  }
 
   // 3. 파싱 규칙에 따라 데이터 추출
   const parsedData = parseHtml(rawHtml, parseRules);
diff --git a/views/admin/mapper.ejs b/views/admin/mapper.ejs
index 2a05edd..6bf5c4a 100644
--- a/views/admin/mapper.ejs
+++ b/views/admin/mapper.ejs
@@ -26,6 +26,9 @@
+
@@ -36,6 +39,28 @@
+ +
+

0 크롤링 옵션

+
+ JS대기 + ms +
+
+ +
+ +
+

1 데이터 타입

@@ -195,9 +220,20 @@ async function fetchPage() {
   document.getElementById('status-bar').textContent = '페이지 로딩 중...';
 
   try {
+    var useBrowser = document.getElementById('m-browser').checked;
+    var waitMs = parseInt(document.getElementById('m-wait').value, 10) || 3000;
+    var fetchBody = { url: url };
+    if (useBrowser) {
+      fetchBody.browser = true;
+      fetchBody.wait = waitMs;
+      // Same login rules as the saved JSON, so the preview matches what the crawler will do.
+      var loginRules = buildLoginRules(url);
+      if (loginRules) fetchBody.login = loginRules;
+      document.getElementById('status-bar').textContent = '브라우저 모드 로딩 중... (JS 렌더링 대기 ' + waitMs + 'ms)';
+    }
     var resp = await fetch('/api/fetch-page', {
       method: 'POST', headers: {'Content-Type':'application/json'}, credentials: 'same-origin',
-      body: JSON.stringify({ url: url })
+      body: JSON.stringify(fetchBody)
     });
     if (!resp.ok) { var err = await resp.json().catch(function(){return {error:'HTTP '+resp.status}}); throw new Error(err.error || 'HTTP '+resp.status); }
     var res = await resp.json();
@@ -303,9 +339,43 @@ window.addEventListener('message', function(e) {
   updateJson();
 });
 
+// === Login field toggle ===
+function toggleLogin() {
+  document.getElementById('login-fields').style.display = document.getElementById('m-login-enable').checked ? 'block' : 'none';
+}
+
+// === Build login rules from the form ===
+// Returns null unless login is enabled and both the user and password selectors are filled in.
+// fallbackUrl is used as the login page when the login URL field is empty.
+function buildLoginRules(fallbackUrl) {
+  if (!document.getElementById('m-login-enable').checked) return null;
+  var userSel = document.getElementById('m-login-user-sel').value.trim();
+  var passSel = document.getElementById('m-login-pass-sel').value.trim();
+  var btnSel = document.getElementById('m-login-btn-sel').value.trim();
+  if (!userSel || !passSel) return null;
+  return {
+    url: document.getElementById('m-login-url').value.trim() || fallbackUrl || '',
+    steps: [
+      { action: 'type', selector: userSel, value: document.getElementById('m-login-user-val').value },
+      { action: 'type', selector: passSel, value: document.getElementById('m-login-pass-val').value },
+      { action: 'click', selector: btnSel || 'button[type=submit]' },
+      { wait: 2000 }
+    ]
+  };
+}
+
 // === JSON 미리보기 업데이트 ===
 function updateJson() {
   var rules = {};
+
+  // Browser mode (fetchPage only sends login in browser mode, so login is nested here too)
+  if (document.getElementById('m-browser').checked) {
+    rules.browser = true;
+    rules.wait = parseInt(document.getElementById('m-wait').value, 10) || 3000;
+    var loginRules = buildLoginRules('');
+    if (loginRules) rules.login = loginRules;
+  }
+
   if (dataType === 'landing') {
     rules.content_selector = mappings.content_selector || 'body';
     rules.remove_selectors = 'script, style, iframe, nav, header, footer, .ad, .ads, .sidebar';
@@ -318,7 +388,6 @@ function updateJson() {
     rules.fields = {};
     ['name','url','url_text','rank','features'].forEach(function(f) {
       if (mappings[f]) {
-        // 컨테이너 기준 상대 셀렉터로 변환
         var sel = mappings[f].selector;
         if (containerSelector && sel.indexOf(containerSelector) === 0) {
           sel = sel.substring(containerSelector.length).replace(/^\s*>\s*/, '');