Files
crawlmanager/src/services/crawler.js
T
chpark 52b1c40132 fix: 사이트 관리 페이지 개선 + 디스커버리 크롤링 + 광고 슬롯 선택화
- 디스커버리 크롤링: 태그/목록 페이지 URL에서 최신 글 링크 자동 탐색
  (parse_rules.discovery.link_selector로 Read more 링크 찾기)
- AdSense 슬롯 ID 선택사항: client_id만 있으면 자동 광고 동작
- 사이트 관리: 저장 후 목록 즉시 갱신 (await loadSites)
- 사이트 관리: 크롤링 스케줄 설정 UI 추가 (크론 프리셋 버튼)
- 사이트 관리: 미리보기 버튼 추가 (렌더링 + 파싱 데이터 확인)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 01:05:21 +09:00

355 lines
16 KiB
JavaScript

const axios = require('axios');
const cheerio = require('cheerio');
const https = require('https');
const db = require('../db');
// Browser-like headers sent with every crawl request.
const CRAWLER_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
};

// Ignores SSL certificate errors (self-signed certificates and the like).
// NOTE(review): rejectUnauthorized=false disables TLS certificate verification
// for every request made through this client — confirm this trade-off is intended.
const lenientHttpsAgent = new https.Agent({ rejectUnauthorized: false });

// Shared HTTP client used for all crawl requests (timeout is in milliseconds).
const axiosInstance = axios.create({
  httpsAgent: lenientHttpsAgent,
  timeout: 30000,
  headers: CRAWLER_REQUEST_HEADERS,
});
/**
 * Resolves the URL of the latest post from a list/tag page ("discovery" mode).
 *
 * Fetches `listUrl`, takes the first element matching the configured link
 * selector and resolves its `href` against `listUrl`. When no usable link is
 * found, a `discovery_warn` log entry is written and `listUrl` itself is returned.
 *
 * @param {*} siteId - Site id used for the crawl log entries.
 * @param {string} listUrl - URL of the list page to inspect.
 * @param {object} discovery - `parse_rules.discovery`; `link_selector` is optional.
 * @returns {Promise<string>} Absolute URL that should be crawled.
 */
async function discoverTargetUrl(siteId, listUrl, discovery) {
  await logCrawl(siteId, 'discovery', `목록 페이지에서 최신 글 탐색: ${listUrl}`);
  const listResponse = await axiosInstance.get(listUrl);
  const $list = cheerio.load(listResponse.data);
  const selector = discovery.link_selector || 'a.read-more, a.more-link, .entry-title a, article a';
  const href = $list(selector).first().attr('href');
  if (!href) {
    // Covers both "no element matched" and "element matched but has no href",
    // so the fallback to the list URL is always visible in the crawl log.
    await logCrawl(siteId, 'discovery_warn', `최신 글 링크를 찾을 수 없음. 원본 URL 사용: ${listUrl}`);
    return listUrl;
  }
  // Relative URL -> absolute URL
  const targetUrl = new URL(href, listUrl).toString();
  await logCrawl(siteId, 'discovery', `최신 글 발견: ${targetUrl}`);
  return targetUrl;
}

/**
 * Best-effort persistence of a failed crawl (error result row + crawl log).
 *
 * Never throws: if the database write itself fails, that secondary failure is
 * reported on stderr so it cannot replace the crawl error the caller rethrows.
 *
 * @param {*} siteId - Site id the failure belongs to.
 * @param {Error} err - The error that aborted the crawl.
 * @returns {Promise<void>}
 */
async function recordCrawlFailure(siteId, err) {
  try {
    await db.query(
      `INSERT INTO crawl_results (site_id, status, error_message)
VALUES ($1, 'error', $2)`,
      [siteId, err.message]
    );
    await logCrawl(siteId, 'crawl_error', err.message);
  } catch (recordErr) {
    console.error(`[crawler] failed to record crawl error for site ${siteId}:`, recordErr);
  }
}

/**
 * Crawls a site and stores the result in the database.
 *
 * Steps: load the site row, optionally discover the latest post URL, fetch the
 * HTML, parse it with the site's `parse_rules`, render the public page, insert
 * a `crawl_results` row and update `sites.last_crawled_at`.
 *
 * @param {*} siteId - Primary key of the row in `sites`.
 * @returns {Promise<{success: boolean, itemCount: number, crawledUrl: string}>}
 * @throws {Error} When the site does not exist, or when any crawl step fails
 *   (the original failure is rethrown after it has been recorded).
 */
async function crawlSite(siteId) {
  const { rows } = await db.query('SELECT * FROM sites WHERE id = $1', [siteId]);
  if (rows.length === 0) throw new Error(`Site ${siteId} not found`);
  const site = rows[0];
  const parseRules = site.parse_rules || {};
  await logCrawl(siteId, 'crawl_start', `크롤링 시작: ${site.url}`);
  try {
    // 1. Decide which URL to crawl (discovery or the site URL directly)
    const targetUrl = parseRules.discovery
      ? await discoverTargetUrl(siteId, site.url, parseRules.discovery)
      : site.url;
    // 2. Fetch the HTML
    const response = await axiosInstance.get(targetUrl);
    const rawHtml = response.data;
    // 3. Extract data according to the parse rules
    const parsedData = parseHtml(rawHtml, parseRules);
    parsedData.meta._crawled_url = targetUrl;
    // 4. Build the HTML used for rendering
    const adsenseConfig = await getAdsenseConfig(site.adsense_config_id);
    const renderedHtml = renderPublicPage(site, parsedData, adsenseConfig);
    // 5. Store in the database
    await db.query(
      `INSERT INTO crawl_results (site_id, raw_html, parsed_data, rendered_html, status)
VALUES ($1, $2, $3, $4, 'success')`,
      [siteId, rawHtml, JSON.stringify(parsedData), renderedHtml]
    );
    // 6. Update the site's last-crawled timestamp
    await db.query(
      'UPDATE sites SET last_crawled_at = NOW(), updated_at = NOW() WHERE id = $1',
      [siteId]
    );
    const itemCount = parsedData.items?.length || 0;
    await logCrawl(siteId, 'crawl_success', `크롤링 완료 (${targetUrl}). ${itemCount}개 항목 추출`);
    return { success: true, itemCount, crawledUrl: targetUrl };
  } catch (err) {
    // Recording is best-effort so the caller always sees the real crawl error.
    await recordCrawlFailure(siteId, err);
    throw err;
  }
}
/**
 * Parses HTML according to a site's parse rules.
 *
 * parse_rules format:
 * {
 *   "discovery": {                     // (optional) auto-discover the latest post from a list page
 *     "link_selector": "a.read-more"   // CSS selector for the "Read more" link
 *   },
 *   "container": "table.easy-table tbody tr",
 *   "fields": { ... },
 *   "meta": { ... }
 * }
 *
 * Each entry of `fields` / `meta` is a rule object with a `selector` plus the
 * options understood by `extractValue`.
 *
 * @param {string} html - Raw HTML document.
 * @param {object} [rules] - Parse rules; a missing value is treated as `{}`.
 * @returns {{items: object[], meta: object}} Extracted rows and page-level metadata.
 *   Every item also carries `_inactive` (row contains a `<del>` element) and
 *   `_index` (1-based position of the row among the container matches).
 */
function parseHtml(html, rules) {
  // Exported function: tolerate callers that pass null/undefined rules.
  const activeRules = rules || {};
  const $ = cheerio.load(html);
  const result = { items: [], meta: {} };
  // Extract page-level metadata
  if (activeRules.meta) {
    for (const [key, rule] of Object.entries(activeRules.meta)) {
      result.meta[key] = extractValue($, $(rule.selector).first(), rule);
    }
  }
  // Extract items
  if (activeRules.container && activeRules.fields) {
    const fieldRules = Object.entries(activeRules.fields);
    $(activeRules.container).each((idx, el) => {
      const row = $(el);
      const item = {};
      let hasData = false;
      for (const [key, rule] of fieldRules) {
        item[key] = extractValue($, row.find(rule.selector).first(), rule);
        if (item[key]) hasData = true;
      }
      // Rows where every field came back empty are skipped entirely.
      if (!hasData) return;
      // Inactive (strikethrough) check. Querying for the element instead of
      // searching the serialized HTML for the literal "<del>" also catches
      // <del class="..."> and cannot be fooled by attribute text.
      item._inactive = row.find('del').length > 0;
      item._index = idx + 1;
      result.items.push(item);
    });
  }
  // Without a container rule, fall back to basic page information only
  if (!activeRules.container) {
    result.meta.title = $('title').text().trim();
    result.meta.description = $('meta[name="description"]').attr('content') || '';
    result.meta.rawTextPreview = $('body').text().trim().substring(0, 500);
  }
  return result;
}
/**
 * Reads a single value out of a selection according to an extraction rule.
 *
 * @param {*} $ - Loaded document handle (not used by this function).
 * @param {*} el - Selection to read from; null or an empty selection yields ''.
 * @param {{type?: string, attr?: string}} rule - `type` is 'attr' (reads the
 *   attribute named by `rule.attr`), 'html' (inner HTML) or anything else
 *   (trimmed text content).
 * @returns {string} The extracted value, or '' when nothing is available.
 */
function extractValue($, el, rule) {
  const isEmptySelection = !el || el.length === 0;
  if (isEmptySelection) {
    return '';
  }
  if (rule.type === 'attr') {
    return el.attr(rule.attr) || '';
  }
  if (rule.type === 'html') {
    return el.html() || '';
  }
  // 'text' and every unrecognised type fall back to the trimmed text content.
  return el.text().trim();
}
/**
 * Neutralises link targets whose scheme would run script when clicked.
 *
 * Item URLs are scraped from third-party pages, so `javascript:`, `data:` and
 * `vbscript:` targets are replaced with '#'. Every other value (absolute,
 * relative or scheme-less) is returned unchanged.
 *
 * @param {*} rawUrl - Candidate href value.
 * @returns {string} The original value as a string, or '#' when it is unsafe.
 */
function sanitizeHref(rawUrl) {
  const url = String(rawUrl ?? '');
  // Whitespace and control characters are stripped before the scheme test so
  // that values such as " java\tscript:..." cannot slip past the check.
  const schemeProbe = url.replace(/[\u0000-\u0020\u007f]/g, '').toLowerCase();
  return /^(?:javascript|data|vbscript):/.test(schemeProbe) ? '#' : url;
}

/**
 * Renders the HTML for the public page.
 *
 * @param {object} site - Site row; `name` and `description` are read.
 * @param {{items?: object[], meta?: object}} parsedData - Output of `parseHtml`.
 *   Items may carry `rank`, `name`, `url`, `url_text`, `features`, `_index`
 *   and `_inactive`.
 * @param {?object} adsenseConfig - AdSense settings (`client_id`, optional
 *   `slots.top` / `slots.middle` / `slots.bottom`), or null for no ads.
 * @returns {string} A complete HTML document.
 */
function renderPublicPage(site, parsedData, adsenseConfig) {
  const items = parsedData.items || [];
  const meta = parsedData.meta || {};
  const ads = adsenseConfig || {};
  const generatedAt = new Date();
  const now = generatedAt.toLocaleString('ko-KR', { timeZone: 'Asia/Seoul' });
  // Version badge: the "ver.X.Y" number from the crawled title, otherwise the
  // current date. (Splitting `now` on the first space only kept the year
  // token of the ko-KR date, so the date is formatted on its own here.)
  const today = generatedAt.toLocaleDateString('ko-KR', { timeZone: 'Asia/Seoul' });
  const versionLabel = meta.title?.match(/ver\.?([\d.]+)/)?.[1] || today;
  const activeItems = items.filter(i => !i._inactive);
  const inactiveItems = items.filter(i => i._inactive);
  // Build the ranking card HTML
  let cardsHtml = '';
  activeItems.forEach((item, idx) => {
    const rank = item.rank || item._index || (idx + 1);
    // `rank` may be scraped text such as "1", so compare numerically.
    const rankNumber = Number(rank);
    const rankClass = rankNumber === 1 ? 'r1' : rankNumber === 2 ? 'r2' : rankNumber === 3 ? 'r3' : '';
    const stars = (item.features || '').match(/★/g);
    const starCount = stars ? stars.length : 0;
    const tagText = (item.features || '').replace(/★/g, '').trim();
    const starsHtml = starCount > 0 ? `<span class="stars">${'&#x2605;'.repeat(starCount)}</span>` : '';
    const tagHtml = tagText ? `<span class="feature-tag">${escapeHtml(tagText)}</span>` : '';
    // Scraped values are untrusted: the link target is scheme-checked and the
    // rank is escaped like every other interpolated field.
    const href = sanitizeHref(item.url || item.url_text || '#');
    cardsHtml += `
<a href="${escapeHtml(href)}" class="rank-card" target="_blank" rel="noopener noreferrer nofollow">
<div class="rank-num ${rankClass}">${escapeHtml(String(rank))}</div>
<div class="rank-body">
<div class="rank-name">${escapeHtml(item.name || '')}</div>
<div class="rank-url">${escapeHtml(item.url_text || item.url || '')}</div>
${(starsHtml || tagHtml) ? `<div class="rank-features">${starsHtml}${tagHtml}</div>` : ''}
</div>
<span class="rank-arrow">&#x203a;</span>
</a>`;
    // Mid-list ad after the 5th card
    if (idx === 4 && ads.client_id) {
      cardsHtml += renderAdBlock(ads.client_id, ads.slots?.middle);
    }
  });
  // Inactive sites
  let inactiveHtml = '';
  if (inactiveItems.length > 0) {
    inactiveHtml = `
<div class="section-header" style="margin-top:2rem;">
<h2 style="color:var(--text-muted);">접속 불가 사이트</h2>
<span class="badge" style="background:var(--danger);">확인 필요</span>
</div>
<div class="rank-list">
${inactiveItems.map(item => `
<div class="rank-card inactive">
<div class="rank-num">&#x2717;</div>
<div class="rank-body">
<div class="rank-name">${escapeHtml(item.name || '')}</div>
<div class="rank-url">${escapeHtml(item.url_text || '')}</div>
</div>
<span class="rank-arrow" style="opacity:0.3;">&#x203a;</span>
</div>
`).join('')}
</div>`;
  }
  // AdSense: auto ads work with just a client_id
  const adsenseScript = ads.client_id
    ? `<script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=${escapeHtml(ads.client_id)}" crossorigin="anonymous"></script>`
    : '';
  const topAd = ads.client_id ? renderAdBlock(ads.client_id, ads.slots?.top) : '';
  const bottomAd = ads.client_id ? renderAdBlock(ads.client_id, ads.slots?.bottom) : '';
  return `<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="${escapeHtml(site.description || '')}">
<title>${escapeHtml(site.name || 'Torrent Rank')}</title>
${adsenseScript}
<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>&#x1f3af;</text></svg>">
<style>
:root{--primary:#6c5ce7;--primary-light:#a29bfe;--bg:#0a0a1a;--bg-card:#12122a;--bg-card-hover:#1a1a3e;--text:#e0e0ee;--text-muted:#7878aa;--accent:#00cec9;--gold:#ffd700;--silver:#c0c0c0;--bronze:#cd7f32;--danger:#ff6b6b;--border:#1e1e44;--star:#f9ca24}
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI','Noto Sans KR',sans-serif;background:var(--bg);color:var(--text);line-height:1.6;min-height:100vh}
.header{background:linear-gradient(135deg,#0a0a2e 0%,#1a0a3e 50%,#0a1a3e 100%);padding:2.5rem 1rem 2rem;text-align:center;border-bottom:1px solid var(--border);position:relative;overflow:hidden}
.header h1{font-size:1.8rem;font-weight:800;color:#fff;position:relative;z-index:1}
.version-badge{display:inline-block;background:var(--accent);color:#000;font-size:.7rem;font-weight:700;padding:.2rem .6rem;border-radius:12px;margin-left:.5rem;vertical-align:middle}
.sub-info{color:var(--text-muted);font-size:.82rem;margin-top:.6rem;position:relative;z-index:1}
.stats-row{display:inline-flex;gap:1rem;margin-top:1rem;position:relative;z-index:1;flex-wrap:wrap;justify-content:center}
.stat-chip{background:rgba(255,255,255,.05);border:1px solid rgba(255,255,255,.08);padding:.35rem .9rem;border-radius:20px;font-size:.75rem;color:var(--accent)}
.container{max-width:960px;margin:0 auto;padding:1.5rem 1rem}
.ad-box{background:var(--bg-card);border:1px dashed var(--border);border-radius:10px;padding:.8rem;margin:1.2rem 0;text-align:center;min-height:100px}
.ad-box .ad-label{font-size:.65rem;color:var(--text-muted);margin-bottom:.3rem;text-transform:uppercase;letter-spacing:1px}
.section-header{display:flex;align-items:center;gap:.6rem;margin:1.8rem 0 1rem;padding-bottom:.6rem;border-bottom:2px solid var(--border)}
.section-header h2{font-size:1.15rem;font-weight:700;color:#fff}
.badge{background:var(--primary);color:#fff;font-size:.7rem;padding:.15rem .5rem;border-radius:10px}
.rank-list{display:flex;flex-direction:column;gap:.6rem}
.rank-card{display:grid;grid-template-columns:48px 1fr auto;align-items:center;gap:1rem;background:var(--bg-card);border:1px solid var(--border);border-radius:14px;padding:1rem 1.2rem;transition:all .2s;text-decoration:none;color:inherit}
.rank-card:hover{background:var(--bg-card-hover);border-color:var(--primary);transform:translateY(-2px);box-shadow:0 8px 25px rgba(108,92,231,.15)}
.rank-card.inactive{opacity:.45;border-style:dashed}
.rank-num{width:48px;height:48px;border-radius:14px;display:flex;align-items:center;justify-content:center;font-weight:800;font-size:1.1rem;background:var(--border);color:var(--text-muted);flex-shrink:0}
.rank-num.r1{background:linear-gradient(135deg,#ffd700,#f0c800);color:#1a1a00;box-shadow:0 4px 15px rgba(255,215,0,.3)}
.rank-num.r2{background:linear-gradient(135deg,#e0e0e0,#b0b0b0);color:#1a1a1a;box-shadow:0 4px 12px rgba(192,192,192,.2)}
.rank-num.r3{background:linear-gradient(135deg,#cd7f32,#b06820);color:#fff;box-shadow:0 4px 12px rgba(205,127,50,.2)}
.rank-body{min-width:0}
.rank-name{font-size:1.05rem;font-weight:700;color:#fff;margin-bottom:.2rem}
.rank-card.inactive .rank-name{text-decoration:line-through;color:var(--text-muted)}
.rank-url{font-size:.78rem;color:var(--accent);word-break:break-all;opacity:.85}
.rank-features{display:inline-flex;align-items:center;gap:.3rem;margin-top:.3rem}
.feature-tag{background:rgba(108,92,231,.15);color:var(--primary-light);font-size:.7rem;padding:.15rem .5rem;border-radius:6px;font-weight:600}
.stars{color:var(--star);font-size:.8rem;letter-spacing:1px}
.rank-arrow{color:var(--text-muted);font-size:1.3rem;transition:all .2s;flex-shrink:0}
.rank-card:hover .rank-arrow{color:var(--accent);transform:translateX(3px)}
.notice-box{background:linear-gradient(135deg,rgba(255,107,107,.08),rgba(255,107,107,.03));border:1px solid rgba(255,107,107,.2);border-radius:12px;padding:1.2rem 1.5rem;margin:1.5rem 0;font-size:.85rem;line-height:1.7}
.notice-box h3{color:var(--danger);font-size:.9rem;margin-bottom:.5rem}
.notice-box p{color:var(--text-muted)}
.footer{text-align:center;padding:2rem 1rem;margin-top:2rem;border-top:1px solid var(--border);color:var(--text-muted);font-size:.75rem}
.footer a{color:var(--primary-light);text-decoration:none}
@media(max-width:640px){.header h1{font-size:1.3rem}.version-badge{display:block;margin:.5rem auto 0;width:fit-content}.rank-card{grid-template-columns:40px 1fr auto;padding:.8rem;gap:.7rem}.rank-num{width:40px;height:40px;font-size:.95rem;border-radius:10px}.rank-name{font-size:.95rem}}
</style>
</head>
<body>
<header class="header">
<h1>${escapeHtml(site.name)}<span class="version-badge">${escapeHtml(versionLabel)}</span></h1>
<p class="sub-info">업데이트: ${now}</p>
<div class="stats-row">
<span class="stat-chip">${activeItems.length}개 사이트</span>
<span class="stat-chip">비회원제 (가입 불필요)</span>
<span class="stat-chip">자동 갱신</span>
</div>
</header>
<div class="container">
${topAd}
<div class="notice-box">
<h3>&#x26a0;&#xfe0f; 이용 시 주의사항</h3>
<p>토렌트 다운로드 시 동시에 업로드에도 참여하게 됩니다. 저작권이 있는 파일을 다운로드할 경우 법적 책임이 발생할 수 있으며, 압축파일(*.zip) 형태의 토렌트는 악성코드 포함 가능성이 있으니 주의하세요.</p>
</div>
<div class="section-header">
<h2>추천 토렌트 사이트 순위</h2>
<span class="badge">비회원제</span>
</div>
<div class="rank-list">
${cardsHtml || '<div style="text-align:center;padding:3rem;color:var(--text-muted)"><p>아직 수집된 데이터가 없습니다.</p></div>'}
</div>
${inactiveHtml}
${bottomAd}
</div>
<footer class="footer">
<p>&copy; ${generatedAt.getFullYear()} ${escapeHtml(site.name)}</p>
<p style="margin-top:.3rem;font-size:.7rem;">본 사이트는 정보 제공 목적이며, 불법 다운로드를 조장하지 않습니다.</p>
</footer>
</body>
</html>`;
}
/**
 * Renders one AdSense ad unit wrapped in an `.ad-box` container.
 *
 * @param {string} clientId - AdSense client id; a falsy value renders nothing.
 * @param {string} [slotId] - Ad slot id. When omitted the `data-ad-slot`
 *   attribute is left out and placement is left to AdSense auto ads.
 * @returns {string} HTML fragment, or '' when there is no client id.
 */
function renderAdBlock(clientId, slotId) {
  if (!clientId) {
    return '';
  }
  const insAttributes = [
    'class="adsbygoogle"',
    'style="display:block"',
    `data-ad-client="${escapeHtml(clientId)}"`,
  ];
  // The slot attribute is only emitted when a slot id was configured.
  if (slotId) {
    insAttributes.push(`data-ad-slot="${escapeHtml(slotId)}"`);
  }
  insAttributes.push('data-ad-format="auto"', 'data-full-width-responsive="true"');
  const fragmentLines = [
    '',
    '<div class="ad-box">',
    `<ins ${insAttributes.join(' ')}></ins>`,
    '<script>(adsbygoogle = window.adsbygoogle || []).push({});</script>',
    '</div>',
  ];
  return fragmentLines.join('\n');
}
/**
 * Escapes a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} str - Value to escape; non-strings are converted with `String()`.
 * @returns {string} The escaped text. Only `null` and `undefined` map to '',
 *   so meaningful falsy values such as the number 0 are rendered as "0"
 *   instead of being dropped.
 */
function escapeHtml(str) {
  if (str == null) return '';
  return String(str)
    .replace(/&/g, '&amp;') // must run first so later entities are not double-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#039;');
}
/**
 * Loads an active AdSense configuration by id.
 *
 * @param {*} configId - Id of the `adsense_configs` row; a falsy id skips the query.
 * @returns {Promise<?object>} The matching active row, or null when no id was
 *   given or no active row exists.
 */
async function getAdsenseConfig(configId) {
  if (configId) {
    const result = await db.query(
      'SELECT * FROM adsense_configs WHERE id = $1 AND is_active = TRUE',
      [configId]
    );
    const firstRow = result.rows[0];
    return firstRow ? firstRow : null;
  }
  return null;
}
/**
 * Appends one entry to the `crawl_logs` table.
 *
 * @param {*} siteId - Site the entry belongs to.
 * @param {string} action - Event name, e.g. 'crawl_start' or 'crawl_error'.
 * @param {string} message - Log message text.
 * @returns {Promise<void>} Resolves once the insert has completed.
 */
async function logCrawl(siteId, action, message) {
  const insertSql = 'INSERT INTO crawl_logs (site_id, action, message) VALUES ($1, $2, $3)';
  const values = [siteId, action, message];
  await db.query(insertSql, values);
}
// Public API of this module: only crawlSite and parseHtml are exported; the
// other functions defined in this file are not part of the exports.
module.exports = { crawlSite, parseHtml };