commit c61f10560fd34ed42a8c28dd6cbe1d4aa2be1e4f Author: chpark Date: Fri Mar 27 00:44:19 2026 +0900 init: 크롤링 관리 솔루션 초기 구성 - Express.js 기반 관리자 페이지 (사이트/크롤링/AdSense/도메인 관리) - PostgreSQL 16 + Docker Compose (Traefik 연동) - 크롤러: axios + cheerio 기반 HTML 파싱 - 스케줄러: node-cron 기반 자동 크롤링 - 공개 사이트: slug/도메인 기반 DB에서 렌더링 HTML 서빙 - 도메인: admin.startover.co.kr Co-Authored-By: Claude Opus 4.6 (1M context) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..db702f0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +node_modules +.env +.git +*.md diff --git a/.env b/.env new file mode 100644 index 0000000..96b6897 --- /dev/null +++ b/.env @@ -0,0 +1,14 @@ +# 로컬 개발용 환경변수 +PORT=3000 +NODE_ENV=development + +# PostgreSQL +DB_HOST=crawl-manager-db +DB_PORT=5432 +DB_NAME=crawler +DB_USER=crawler +DB_PASSWORD=qlalfqjsgh11!! + +# 관리자 인증 +ADMIN_USER=chpark@admin.co.kr +ADMIN_PASS=1313Qkrckd!!!!!! diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..fdcf751 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# 서버 설정 +PORT=3000 +NODE_ENV=production + +# PostgreSQL 설정 +DB_HOST=db +DB_PORT=5432 +DB_NAME=crawl_manager +DB_USER=crawl_admin +DB_PASSWORD=change_this_password + +# 관리자 인증 (Basic Auth) +ADMIN_USER=admin +ADMIN_PASS=change_this_password diff --git a/.env.production b/.env.production new file mode 100644 index 0000000..121d65c --- /dev/null +++ b/.env.production @@ -0,0 +1,19 @@ +# 서버 설정 +PORT=3000 +NODE_ENV=production + +# PostgreSQL 설정 (Docker postgres 컨테이너용) +POSTGRES_DB=crawler +POSTGRES_USER=crawler +POSTGRES_PASSWORD=qlalfqjsgh11!! + +# App에서 사용하는 DB 연결 (컨테이너 이름 기반) +DB_HOST=crawl-manager-db +DB_PORT=5432 +DB_NAME=crawler +DB_USER=crawler +DB_PASSWORD=qlalfqjsgh11!! + +# 관리자 인증 (Basic Auth) +ADMIN_USER=chpark@admin.co.kr +ADMIN_PASS=1313Qkrckd!!!!!! diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..605a06c --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +crawl-pgdata/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..002ebf2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM node:20-alpine + +WORKDIR /app + +COPY package.json package-lock.json* ./ +RUN npm install --production + +COPY . . + +EXPOSE 3000 + +CMD ["node", "src/app.js"] diff --git a/deploy.bat b/deploy.bat new file mode 100644 index 0000000..48e57c9 --- /dev/null +++ b/deploy.bat @@ -0,0 +1,50 @@ +@echo off +REM ========================================== +REM Crawl Manager 서버 배포 스크립트 (Windows) +REM ========================================== +REM 사용법: deploy.bat +REM 사전 준비: SSH 키 설정 또는 비밀번호 인증 +REM ========================================== + +REM ===== 설정 (본인 환경에 맞게 수정) ===== +SET SERVER_USER=root +SET SERVER_HOST=your-server-ip +SET SERVER_PORT=22 +SET REMOTE_DIR=/home/crawl-manager +SET PROJECT_DIR=%~dp0 + +echo. +echo ========================================== +echo Crawl Manager 배포 시작 +echo ========================================== +echo. + +REM 1. 서버에 디렉토리 생성 +echo [1/4] 서버 디렉토리 준비... +ssh -p %SERVER_PORT% %SERVER_USER%@%SERVER_HOST% "mkdir -p %REMOTE_DIR%/postgres_data %REMOTE_DIR%/app_data" + +REM 2. 파일 전송 (scp) +echo [2/4] 파일 전송 중... +scp -P %SERVER_PORT% -r "%PROJECT_DIR%src" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% -r "%PROJECT_DIR%views" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% -r "%PROJECT_DIR%public" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% "%PROJECT_DIR%package.json" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% "%PROJECT_DIR%Dockerfile" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% "%PROJECT_DIR%docker-compose.yml" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% "%PROJECT_DIR%.env.production" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ +scp -P %SERVER_PORT% "%PROJECT_DIR%.dockerignore" %SERVER_USER%@%SERVER_HOST%:%REMOTE_DIR%/ + +REM 3. Docker 빌드 & 실행 +echo [3/4] Docker 빌드 및 실행... +ssh -p %SERVER_PORT% %SERVER_USER%@%SERVER_HOST% "cd %REMOTE_DIR% && docker compose down && docker compose build --no-cache && docker compose up -d" + +REM 4. 상태 확인 +echo [4/4] 상태 확인... +ssh -p %SERVER_PORT% %SERVER_USER%@%SERVER_HOST% "cd %REMOTE_DIR% && docker compose ps" + +echo. +echo ========================================== +echo 배포 완료! +echo 관리자: https://admin.startover.co.kr/admin +echo ========================================== +pause diff --git a/deploy.ps1 b/deploy.ps1 new file mode 100644 index 0000000..3d1b6fa --- /dev/null +++ b/deploy.ps1 @@ -0,0 +1,67 @@ +# ========================================== +# Crawl Manager 서버 배포 스크립트 (PowerShell) +# ========================================== +# 사용법: .\deploy.ps1 +# 또는: .\deploy.ps1 -ServerHost "1.2.3.4" -ServerUser "root" +# ========================================== + +param( + [string]$ServerUser = "root", + [string]$ServerHost = "your-server-ip", + [int]$ServerPort = 22, + [string]$RemoteDir = "/home/crawl-manager" +) + +$ProjectDir = $PSScriptRoot + +Write-Host "" +Write-Host "==========================================" -ForegroundColor Cyan +Write-Host " Crawl Manager 배포 시작" -ForegroundColor Cyan +Write-Host "==========================================" -ForegroundColor Cyan +Write-Host "" + +# 1. 서버 디렉토리 생성 +Write-Host "[1/4] 서버 디렉토리 준비..." -ForegroundColor Yellow +ssh -p $ServerPort "$ServerUser@$ServerHost" "mkdir -p $RemoteDir/postgres_data $RemoteDir/app_data" + +# 2. 파일 전송 +Write-Host "[2/4] 파일 전송 중..." -ForegroundColor Yellow +$filesToCopy = @( + "src", "views", "public" +) +$singleFiles = @( + "package.json", "Dockerfile", "docker-compose.yml", + ".env.production", ".dockerignore" +) + +foreach ($dir in $filesToCopy) { + Write-Host " - $dir/" -ForegroundColor Gray + scp -P $ServerPort -r "$ProjectDir/$dir" "$ServerUser@${ServerHost}:$RemoteDir/" +} + +foreach ($file in $singleFiles) { + $filePath = "$ProjectDir/$file" + if (Test-Path $filePath) { + Write-Host " - $file" -ForegroundColor Gray + scp -P $ServerPort "$filePath" "$ServerUser@${ServerHost}:$RemoteDir/" + } +} + +# 3. Docker 빌드 & 실행 +Write-Host "[3/4] Docker 빌드 및 실행..." -ForegroundColor Yellow +ssh -p $ServerPort "$ServerUser@$ServerHost" @" +cd $RemoteDir +docker compose down +docker compose build --no-cache +docker compose up -d +"@ + +# 4. 상태 확인 +Write-Host "[4/4] 상태 확인..." -ForegroundColor Yellow +ssh -p $ServerPort "$ServerUser@$ServerHost" "cd $RemoteDir && docker compose ps && echo '' && docker compose logs --tail=20 crawl-manager" + +Write-Host "" +Write-Host "==========================================" -ForegroundColor Green +Write-Host " 배포 완료!" -ForegroundColor Green +Write-Host " 관리자: https://admin.startover.co.kr/admin" -ForegroundColor Green +Write-Host "==========================================" -ForegroundColor Green diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..debba0d --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,43 @@ +# 로컬 개발용 (Windows) +# 사용법: docker compose -f docker-compose.dev.yml up -d + +services: + crawl-manager: + build: + context: . + dockerfile: Dockerfile + container_name: crawl-manager + restart: unless-stopped + env_file: + - .env + ports: + - "3000:3000" + depends_on: + crawl-manager-db: + condition: service_healthy + volumes: + - ./src:/app/src + - ./views:/app/views + - ./public:/app/public + + crawl-manager-db: + image: postgres:16-alpine + container_name: crawl-manager-db + restart: unless-stopped + environment: + POSTGRES_DB: crawler + POSTGRES_USER: crawler + POSTGRES_PASSWORD: "qlalfqjsgh11!!" + volumes: + - crawl-pgdata:/var/lib/postgresql/data + - ./src/migrations/init.sql:/docker-entrypoint-initdb.d/01-init.sql + ports: + - "11137:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U crawler -d crawler"] + interval: 5s + timeout: 5s + retries: 5 + +volumes: + crawl-pgdata: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c80e850 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,45 @@ +services: + crawl-manager: + build: + context: . + dockerfile: Dockerfile + container_name: crawl-manager + restart: always + env_file: + - .env.production + stdin_open: true + tty: true + depends_on: + crawl-manager-db: + condition: service_healthy + volumes: + - /home/crawl-manager/app_data:/app/data + labels: + - traefik.enable=true + - traefik.http.routers.crawl-manager.rule=Host(`admin.startover.co.kr`) + - traefik.http.routers.crawl-manager.entrypoints=websecure,web + - traefik.http.routers.crawl-manager.tls=true + - traefik.http.routers.crawl-manager.tls.certresolver=le + - traefik.http.services.crawl-manager.loadbalancer.server.port=3000 + + crawl-manager-db: + image: postgres:16-alpine + container_name: crawl-manager-db + restart: always + env_file: + - .env.production + volumes: + - /home/crawl-manager/postgres_data:/var/lib/postgresql/data + - ./src/migrations/init.sql:/docker-entrypoint-initdb.d/01-init.sql + ports: + - "11137:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U crawler -d crawler"] + interval: 10s + timeout: 5s + retries: 5 + +networks: + default: + external: + name: toktork_server_default diff --git a/package.json b/package.json new file mode 100644 index 0000000..1c7af53 --- /dev/null +++ b/package.json @@ -0,0 +1,21 @@ +{ + "name": "crawl-manager", + "version": "1.0.0", + "description": "크롤링 관리 솔루션", + "main": "src/app.js", + "scripts": { + "start": "node src/app.js", + "dev": "node --watch src/app.js", + "migrate": "node src/migrations/run.js" + }, + "dependencies": { + "express": "^4.21.0", + "pg": "^8.13.0", + "cheerio": "^1.0.0", + "node-cron": "^3.0.3", + "axios": "^1.7.0", + "ejs": "^3.1.10", + "dotenv": "^16.4.0", + "https-proxy-agent": "^7.0.0" + } +} diff --git a/run-local.bat b/run-local.bat new file mode 100644 index 0000000..ac066b5 --- /dev/null +++ b/run-local.bat @@ -0,0 +1,35 @@ +@echo off +REM ========================================== +REM Crawl Manager 로컬 실행 (Windows Docker) +REM ========================================== + +cd /d "%~dp0" + +echo. +echo ========================================== +echo Crawl Manager 로컬 실행 +echo ========================================== +echo. + +REM 기존 컨테이너 정리 +echo [1/3] 기존 컨테이너 정리... +docker compose -f docker-compose.dev.yml down + +REM 빌드 & 실행 +echo [2/3] Docker 빌드 및 실행... +docker compose -f docker-compose.dev.yml up -d --build + +REM 상태 확인 +echo [3/3] 상태 확인... +timeout /t 5 /nobreak >nul +docker compose -f docker-compose.dev.yml ps + +echo. +echo ========================================== +echo 실행 완료! +echo 관리자: http://localhost:3000/admin +echo ID: chpark@admin.co.kr +echo DB 외부접속: localhost:11137 +echo ========================================== +echo. +pause diff --git a/src/app.js b/src/app.js new file mode 100644 index 0000000..234ecf2 --- /dev/null +++ b/src/app.js @@ -0,0 +1,95 @@ +require('dotenv').config(); + +const express = require('express'); +const path = require('path'); +const db = require('./db'); +const apiRouter = require('./routes/api'); +const { router: publicRouter, domainRouter } = require('./routes/public'); +const { initScheduler } = require('./services/scheduler'); + +const app = express(); +const PORT = process.env.PORT || 3000; + +// ===== 미들웨어 ===== +app.use(express.json({ limit: '50mb' })); +app.use(express.urlencoded({ extended: true })); + +// 정적 파일 +app.use('/public', express.static(path.join(__dirname, '..', 'public'))); + +// EJS 템플릿 +app.set('view engine', 'ejs'); +app.set('views', path.join(__dirname, '..', 'views')); + +// ===== 도메인 기반 라우팅 (최우선) ===== +app.use(domainRouter); + +// ===== Basic Auth (관리자 영역만) ===== +function adminAuth(req, res, next) { + const authHeader = req.headers.authorization; + if (!authHeader) { + res.setHeader('WWW-Authenticate', 'Basic realm="Crawl Manager Admin"'); + return res.status(401).send('인증이 필요합니다'); + } + + const [user, pass] = Buffer.from(authHeader.split(' ')[1], 'base64').toString().split(':'); + if (user === process.env.ADMIN_USER && pass === process.env.ADMIN_PASS) { + return next(); + } + + res.setHeader('WWW-Authenticate', 'Basic realm="Crawl Manager Admin"'); + res.status(401).send('인증 실패'); +} + +// ===== 관리자 페이지 ===== +app.get('/admin', adminAuth, (req, res) => { + res.render('admin/dashboard'); +}); + +app.get('/admin/sites', adminAuth, (req, res) => { + res.render('admin/sites'); +}); + +app.get('/admin/sites/:id', adminAuth, (req, res) => { + res.render('admin/site-detail', { siteId: req.params.id }); +}); + +app.get('/admin/adsense', adminAuth, (req, res) => { + res.render('admin/adsense'); +}); + +app.get('/admin/domains', adminAuth, (req, res) => { + res.render('admin/domains'); +}); + +app.get('/admin/logs', adminAuth, (req, res) => { + res.render('admin/logs'); +}); + +// ===== API ===== +app.use('/api', adminAuth, apiRouter); + +// ===== 공개 사이트 ===== +app.use('/', publicRouter); + +// ===== 시작 ===== +async function start() { + try { + // DB 연결 확인 + await db.query('SELECT 1'); + console.log('[DB] PostgreSQL 연결 성공'); + + // 스케줄러 초기화 + await initScheduler(); + + app.listen(PORT, '0.0.0.0', () => { + console.log(`[APP] 서버 시작: http://0.0.0.0:${PORT}`); + console.log(`[APP] 관리자: http://localhost:${PORT}/admin`); + }); + } catch (err) { + console.error('[APP] 시작 실패:', err.message); + process.exit(1); + } +} + +start(); diff --git a/src/db.js b/src/db.js new file mode 100644 index 0000000..3e5cb04 --- /dev/null +++ b/src/db.js @@ -0,0 +1,20 @@ +const { Pool } = require('pg'); + +const pool = new Pool({ + host: process.env.DB_HOST || 'localhost', + port: parseInt(process.env.DB_PORT || '5432'), + database: process.env.DB_NAME || 'crawl_manager', + user: process.env.DB_USER || 'crawl_admin', + password: process.env.DB_PASSWORD || '', + max: 20, + idleTimeoutMillis: 30000, +}); + +pool.on('error', (err) => { + console.error('[DB] Unexpected error on idle client', err); +}); + +module.exports = { + query: (text, params) => pool.query(text, params), + pool, +}; diff --git a/src/migrations/init.sql b/src/migrations/init.sql new file mode 100644 index 0000000..88eea6a --- /dev/null +++ b/src/migrations/init.sql @@ -0,0 +1,89 @@ +-- 크롤링 관리 솔루션 DB 스키마 + +-- 크롤링 대상 사이트 +CREATE TABLE IF NOT EXISTS sites ( + id SERIAL PRIMARY KEY, + name VARCHAR(200) NOT NULL, + url TEXT NOT NULL, + description TEXT DEFAULT '', + -- CSS 셀렉터 / 파싱 규칙 (JSON) + parse_rules JSONB DEFAULT '{}', + -- 크론 스케줄 (예: "0 6 * * *" = 매일 06시) + cron_schedule VARCHAR(100) DEFAULT '', + -- 스케줄 활성화 여부 + schedule_active BOOLEAN DEFAULT FALSE, + -- 공개 사이트 슬러그 (도메인별 매핑용) + slug VARCHAR(100) UNIQUE, + -- 공개 페이지 템플릿 이름 + template VARCHAR(100) DEFAULT 'default', + -- 상태 + status VARCHAR(20) DEFAULT 'active', + last_crawled_at TIMESTAMPTZ, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 크롤링 결과 저장 +CREATE TABLE IF NOT EXISTS crawl_results ( + id SERIAL PRIMARY KEY, + site_id INTEGER NOT NULL REFERENCES sites(id) ON DELETE CASCADE, + -- 원본 HTML 전체 + raw_html TEXT, + -- 파싱된 데이터 (JSON) + parsed_data JSONB DEFAULT '[]', + -- 최종 렌더링용 HTML + rendered_html TEXT, + -- 크롤링 상태 + status VARCHAR(20) DEFAULT 'success', + error_message TEXT, + crawled_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 최신 크롤링 결과 빠른 조회용 인덱스 +CREATE INDEX IF NOT EXISTS idx_crawl_results_site_latest + ON crawl_results(site_id, crawled_at DESC); + +-- AdSense 설정 +CREATE TABLE IF NOT EXISTS adsense_configs ( + id SERIAL PRIMARY KEY, + name VARCHAR(100) NOT NULL, + client_id VARCHAR(100) NOT NULL, + -- 광고 슬롯들 (JSON: {top: "slot1", middle: "slot2", bottom: "slot3"}) + slots JSONB DEFAULT '{}', + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 사이트별 AdSense 연결 +ALTER TABLE sites ADD COLUMN IF NOT EXISTS adsense_config_id INTEGER REFERENCES adsense_configs(id); + +-- 도메인 매핑 (하나의 사이트를 여러 도메인에서 서비스) +CREATE TABLE IF NOT EXISTS domain_mappings ( + id SERIAL PRIMARY KEY, + domain VARCHAR(255) NOT NULL UNIQUE, + site_id INTEGER NOT NULL REFERENCES sites(id) ON DELETE CASCADE, + adsense_config_id INTEGER REFERENCES adsense_configs(id), + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 크롤링 로그 +CREATE TABLE IF NOT EXISTS crawl_logs ( + id SERIAL PRIMARY KEY, + site_id INTEGER NOT NULL REFERENCES sites(id) ON DELETE CASCADE, + action VARCHAR(50) NOT NULL, + message TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +-- 최신 결과만 빠르게 가져오는 뷰 +CREATE OR REPLACE VIEW latest_crawl_results AS +SELECT DISTINCT ON (site_id) + cr.*, + s.name AS site_name, + s.slug AS site_slug +FROM crawl_results cr +JOIN sites s ON s.id = cr.site_id +WHERE cr.status = 'success' +ORDER BY site_id, crawled_at DESC; diff --git a/src/migrations/run.js b/src/migrations/run.js new file mode 100644 index 0000000..fac16d4 --- /dev/null +++ b/src/migrations/run.js @@ -0,0 +1,18 @@ +require('dotenv').config(); +const fs = require('fs'); +const path = require('path'); +const db = require('../db'); + +async function migrate() { + const sql = fs.readFileSync(path.join(__dirname, 'init.sql'), 'utf-8'); + try { + await db.query(sql); + console.log('[MIGRATE] 마이그레이션 완료'); + process.exit(0); + } catch (err) { + console.error('[MIGRATE] 실패:', err.message); + process.exit(1); + } +} + +migrate(); diff --git a/src/routes/api.js b/src/routes/api.js new file mode 100644 index 0000000..6754dee --- /dev/null +++ b/src/routes/api.js @@ -0,0 +1,240 @@ +const express = require('express'); +const router = express.Router(); +const db = require('../db'); +const { crawlSite } = require('../services/crawler'); +const { updateSchedule, getActiveJobs } = require('../services/scheduler'); + +// ===================== 사이트 CRUD ===================== + +// 목록 +router.get('/sites', async (req, res) => { + try { + const { rows } = await db.query(` + SELECT s.*, ac.name AS adsense_name, + (SELECT COUNT(*) FROM crawl_results WHERE site_id = s.id) AS crawl_count + FROM sites s + LEFT JOIN adsense_configs ac ON ac.id = s.adsense_config_id + ORDER BY s.id + `); + res.json(rows); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 단건 조회 +router.get('/sites/:id', async (req, res) => { + try { + const { rows } = await db.query('SELECT * FROM sites WHERE id = $1', [req.params.id]); + if (rows.length === 0) return res.status(404).json({ error: 'Not found' }); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 생성 +router.post('/sites', async (req, res) => { + try { + const { name, url, description, parse_rules, slug, template, adsense_config_id } = req.body; + const { rows } = await db.query( + `INSERT INTO sites (name, url, description, parse_rules, slug, template, adsense_config_id) + VALUES ($1, $2, $3, $4, $5, $6, $7) RETURNING *`, + [name, url, description || '', parse_rules || {}, slug || null, template || 'default', adsense_config_id || null] + ); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 수정 +router.put('/sites/:id', async (req, res) => { + try { + const { name, url, description, parse_rules, slug, template, adsense_config_id } = req.body; + const { rows } = await db.query( + `UPDATE sites SET name=$1, url=$2, description=$3, parse_rules=$4, + slug=$5, template=$6, adsense_config_id=$7, updated_at=NOW() + WHERE id=$8 RETURNING *`, + [name, url, description, parse_rules, slug || null, template, adsense_config_id || null, req.params.id] + ); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 삭제 +router.delete('/sites/:id', async (req, res) => { + try { + await db.query('DELETE FROM sites WHERE id = $1', [req.params.id]); + res.json({ success: true }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// ===================== 크롤링 ===================== + +// 즉시 크롤링 실행 +router.post('/sites/:id/crawl', async (req, res) => { + try { + const result = await crawlSite(parseInt(req.params.id)); + res.json(result); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 크롤링 결과 목록 +router.get('/sites/:id/results', async (req, res) => { + try { + const limit = parseInt(req.query.limit) || 20; + const { rows } = await db.query( + `SELECT id, site_id, status, error_message, crawled_at, + jsonb_array_length(COALESCE(parsed_data->'items', '[]'::jsonb)) AS item_count + FROM crawl_results WHERE site_id = $1 ORDER BY crawled_at DESC LIMIT $2`, + [req.params.id, limit] + ); + res.json(rows); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 특정 크롤링 결과 상세 (파싱 데이터) +router.get('/results/:id', async (req, res) => { + try { + const { rows } = await db.query( + 'SELECT * FROM crawl_results WHERE id = $1', + [req.params.id] + ); + if (rows.length === 0) return res.status(404).json({ error: 'Not found' }); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// ===================== 스케줄 ===================== + +// 스케줄 업데이트 +router.put('/sites/:id/schedule', async (req, res) => { + try { + const { cron_schedule, schedule_active } = req.body; + await updateSchedule(parseInt(req.params.id), cron_schedule, schedule_active); + res.json({ success: true }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 활성 스케줄 목록 +router.get('/schedules/active', async (req, res) => { + res.json({ active_site_ids: getActiveJobs() }); +}); + +// ===================== AdSense ===================== + +// 목록 +router.get('/adsense', async (req, res) => { + try { + const { rows } = await db.query('SELECT * FROM adsense_configs ORDER BY id'); + res.json(rows); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 생성 +router.post('/adsense', async (req, res) => { + try { + const { name, client_id, slots } = req.body; + const { rows } = await db.query( + `INSERT INTO adsense_configs (name, client_id, slots) VALUES ($1, $2, $3) RETURNING *`, + [name, client_id, slots || {}] + ); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 수정 +router.put('/adsense/:id', async (req, res) => { + try { + const { name, client_id, slots, is_active } = req.body; + const { rows } = await db.query( + `UPDATE adsense_configs SET name=$1, client_id=$2, slots=$3, is_active=$4, updated_at=NOW() + WHERE id=$5 RETURNING *`, + [name, client_id, slots, is_active, req.params.id] + ); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// 삭제 +router.delete('/adsense/:id', async (req, res) => { + try { + await db.query('DELETE FROM adsense_configs WHERE id = $1', [req.params.id]); + res.json({ success: true }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// ===================== 도메인 매핑 ===================== + +router.get('/domains', async (req, res) => { + try { + const { rows } = await db.query(` + SELECT d.*, s.name AS site_name FROM domain_mappings d + LEFT JOIN sites s ON s.id = d.site_id ORDER BY d.id + `); + res.json(rows); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +router.post('/domains', async (req, res) => { + try { + const { domain, site_id, adsense_config_id } = req.body; + const { rows } = await db.query( + `INSERT INTO domain_mappings (domain, site_id, adsense_config_id) VALUES ($1, $2, $3) RETURNING *`, + [domain, site_id, adsense_config_id || null] + ); + res.json(rows[0]); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +router.delete('/domains/:id', async (req, res) => { + try { + await db.query('DELETE FROM domain_mappings WHERE id = $1', [req.params.id]); + res.json({ success: true }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// ===================== 로그 ===================== + +router.get('/logs', async (req, res) => { + try { + const limit = parseInt(req.query.limit) || 50; + const { rows } = await db.query(` + SELECT l.*, s.name AS site_name FROM crawl_logs l + LEFT JOIN sites s ON s.id = l.site_id + ORDER BY l.created_at DESC LIMIT $1 + `, [limit]); + res.json(rows); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +module.exports = router; diff --git a/src/routes/public.js b/src/routes/public.js new file mode 100644 index 0000000..5f6fc74 --- /dev/null +++ b/src/routes/public.js @@ -0,0 +1,73 @@ +const express = require('express'); +const router = express.Router(); +const db = require('../db'); + +/** + * 공개 사이트 라우터 + * - slug 기반: /s/torrent-rank → sites.slug = 'torrent-rank'의 최신 rendered_html 반환 + * - 도메인 기반: Host 헤더로 domain_mappings 조회 + */ + +// slug 기반 접근 +router.get('/s/:slug', async (req, res) => { + try { + const { rows } = await db.query(` + SELECT cr.rendered_html + FROM crawl_results cr + JOIN sites s ON s.id = cr.site_id + WHERE s.slug = $1 AND cr.status = 'success' + ORDER BY cr.crawled_at DESC LIMIT 1 + `, [req.params.slug]); + + if (rows.length === 0 || !rows[0].rendered_html) { + return res.status(404).send('

페이지를 찾을 수 없습니다

아직 크롤링 데이터가 없습니다.

'); + } + + res.type('html').send(rows[0].rendered_html); + } catch (err) { + res.status(500).send('Internal Server Error'); + } +}); + +// 도메인 기반 접근 (미들웨어로 사용) +async function domainRouter(req, res, next) { + // 관리자 경로는 무시 + if (req.path.startsWith('/admin') || req.path.startsWith('/api') || req.path.startsWith('/s/')) { + return next(); + } + + // 루트 경로일 때만 도메인 매핑 처리 + if (req.path !== '/' && req.path !== '/index.html') { + return next(); + } + + const host = req.hostname; + + try { + const { rows } = await db.query(` + SELECT dm.site_id, dm.adsense_config_id + FROM domain_mappings dm + WHERE dm.domain = $1 AND dm.is_active = TRUE + `, [host]); + + if (rows.length === 0) return next(); + + const siteId = rows[0].site_id; + + const result = await db.query(` + SELECT rendered_html FROM crawl_results + WHERE site_id = $1 AND status = 'success' + ORDER BY crawled_at DESC LIMIT 1 + `, [siteId]); + + if (result.rows.length === 0 || !result.rows[0].rendered_html) { + return res.status(404).send('

아직 데이터가 없습니다

'); + } + + res.type('html').send(result.rows[0].rendered_html); + } catch (err) { + next(); + } +} + +module.exports = { router, domainRouter }; diff --git a/src/services/crawler.js b/src/services/crawler.js new file mode 100644 index 0000000..c2f27e2 --- /dev/null +++ b/src/services/crawler.js @@ -0,0 +1,333 @@ +const axios = require('axios'); +const cheerio = require('cheerio'); +const https = require('https'); +const db = require('../db'); + +// SSL 인증서 무시 (자체 서명 등) +const axiosInstance = axios.create({ + httpsAgent: new https.Agent({ rejectUnauthorized: false }), + timeout: 30000, + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7', + }, +}); + +/** + * 사이트를 크롤링하고 DB에 저장 + */ +async function crawlSite(siteId) { + // 사이트 정보 조회 + const { rows } = await db.query('SELECT * FROM sites WHERE id = $1', [siteId]); + if (rows.length === 0) throw new Error(`Site ${siteId} not found`); + const site = rows[0]; + + await logCrawl(siteId, 'crawl_start', `크롤링 시작: ${site.url}`); + + try { + // 1. HTML 가져오기 + const response = await axiosInstance.get(site.url); + const rawHtml = response.data; + + // 2. 파싱 규칙에 따라 데이터 추출 + const parseRules = site.parse_rules || {}; + const parsedData = parseHtml(rawHtml, parseRules); + + // 3. 렌더링용 HTML 생성 + const adsenseConfig = await getAdsenseConfig(site.adsense_config_id); + const renderedHtml = renderPublicPage(site, parsedData, adsenseConfig); + + // 4. DB 저장 + await db.query( + `INSERT INTO crawl_results (site_id, raw_html, parsed_data, rendered_html, status) + VALUES ($1, $2, $3, $4, 'success')`, + [siteId, rawHtml, JSON.stringify(parsedData), renderedHtml] + ); + + // 5. 사이트 최종 크롤링 시간 업데이트 + await db.query( + 'UPDATE sites SET last_crawled_at = NOW(), updated_at = NOW() WHERE id = $1', + [siteId] + ); + + await logCrawl(siteId, 'crawl_success', `크롤링 완료. ${parsedData.items?.length || 0}개 항목 추출`); + + return { success: true, itemCount: parsedData.items?.length || 0 }; + + } catch (err) { + // 에러 기록 + await db.query( + `INSERT INTO crawl_results (site_id, status, error_message) + VALUES ($1, 'error', $2)`, + [siteId, err.message] + ); + await logCrawl(siteId, 'crawl_error', err.message); + throw err; + } +} + +/** + * HTML 파싱 - parse_rules에 따라 데이터 추출 + * + * parse_rules 형식: + * { + * "container": "table.easy-table tbody tr", // 반복 항목 컨테이너 CSS 셀렉터 + * "fields": { + * "rank": { "selector": "td:nth-child(1)", "type": "text" }, + * "name": { "selector": "td:nth-child(2)", "type": "text" }, + * "url": { "selector": "td:nth-child(3) a", "type": "attr", "attr": "href" }, + * "url_text": { "selector": "td:nth-child(3)", "type": "text" }, + * "features": { "selector": "td:nth-child(4)", "type": "text" } + * }, + * "meta": { + * "title": { "selector": "h1.entry-title", "type": "text" }, + * "date": { "selector": "time.entry-date", "type": "attr", "attr": "datetime" } + * } + * } + */ +function parseHtml(html, rules) { + const $ = cheerio.load(html); + const result = { items: [], meta: {} }; + + // 메타 정보 추출 + if (rules.meta) { + for (const [key, rule] of Object.entries(rules.meta)) { + result.meta[key] = extractValue($, $(rule.selector).first(), rule); + } + } + + // 항목 추출 + if (rules.container && rules.fields) { + $(rules.container).each((idx, el) => { + const item = {}; + let hasData = false; + + for (const [key, rule] of Object.entries(rules.fields)) { + const target = $(el).find(rule.selector).first(); + item[key] = extractValue($, target, rule); + if (item[key]) hasData = true; + } + + // 비활성(취소선) 체크 + const rowHtml = $(el).html() || ''; + item._inactive = rowHtml.includes(''); + + if (hasData) { + item._index = idx + 1; + result.items.push(item); + } + }); + } + + // 규칙이 없으면 기본 정보만 + if (!rules.container) { + result.meta.title = $('title').text().trim(); + result.meta.description = $('meta[name="description"]').attr('content') || ''; + result.meta.rawTextPreview = $('body').text().trim().substring(0, 500); + } + + return result; +} + +function extractValue($, el, rule) { + if (!el || el.length === 0) return ''; + switch (rule.type) { + case 'attr': + return el.attr(rule.attr) || ''; + case 'html': + return el.html() || ''; + case 'text': + default: + return el.text().trim(); + } +} + +/** + * 공개 페이지용 HTML 렌더링 + */ +function renderPublicPage(site, parsedData, adsenseConfig) { + const items = parsedData.items || []; + const meta = parsedData.meta || {}; + const ads = adsenseConfig || {}; + const now = new Date().toLocaleString('ko-KR', { timeZone: 'Asia/Seoul' }); + + const activeItems = items.filter(i => !i._inactive); + const inactiveItems = items.filter(i => i._inactive); + + // 순위 카드 HTML 생성 + let cardsHtml = ''; + activeItems.forEach((item, idx) => { + const rank = item.rank || item._index || (idx + 1); + const rankClass = rank == 1 ? 'r1' : rank == 2 ? 'r2' : rank == 3 ? 'r3' : ''; + + // 별점 + 태그 + const stars = (item.features || '').match(/★/g); + const starCount = stars ? stars.length : 0; + const tagText = (item.features || '').replace(/★/g, '').trim(); + const starsHtml = starCount > 0 ? `${'★'.repeat(starCount)}` : ''; + const tagHtml = tagText ? `${escapeHtml(tagText)}` : ''; + + cardsHtml += ` + +
${rank}
+
+
${escapeHtml(item.name || '')}
+
${escapeHtml(item.url_text || item.url || '')}
+ ${(starsHtml || tagHtml) ? `
${starsHtml}${tagHtml}
` : ''} +
+ +
`; + + // 5번째 뒤 중간 광고 + if (idx === 4 && ads.client_id) { + cardsHtml += renderAdBlock(ads.client_id, ads.slots?.middle || ''); + } + }); + + // 비활성 사이트 + let inactiveHtml = ''; + if (inactiveItems.length > 0) { + inactiveHtml = ` +
+

접속 불가 사이트

+ 확인 필요 +
+
+ ${inactiveItems.map(item => ` +
+
+
+
${escapeHtml(item.name || '')}
+
${escapeHtml(item.url_text || '')}
+
+ +
+ `).join('')} +
`; + } + + const adsenseScript = ads.client_id + ? `` + : ''; + + const topAd = ads.client_id ? renderAdBlock(ads.client_id, ads.slots?.top || '') : ''; + const bottomAd = ads.client_id ? renderAdBlock(ads.client_id, ads.slots?.bottom || '') : ''; + + return ` + + + + + +${escapeHtml(site.name || 'Torrent Rank')} +${adsenseScript} + + + + +
+

${escapeHtml(site.name)}${escapeHtml(meta.title?.match(/ver\.?([\d.]+)/)?.[1] || now.split(' ')[0])}

+

업데이트: ${now}

+
+ ${activeItems.length}개 사이트 + 비회원제 (가입 불필요) + 자동 갱신 +
+
+
+ ${topAd} +
+

⚠️ 이용 시 주의사항

+

토렌트 다운로드 시 동시에 업로드에도 참여하게 됩니다. 저작권이 있는 파일을 다운로드할 경우 법적 책임이 발생할 수 있으며, 압축파일(*.zip) 형태의 토렌트는 악성코드 포함 가능성이 있으니 주의하세요.

+
+
+

추천 토렌트 사이트 순위

+ 비회원제 +
+
+ ${cardsHtml || '

아직 수집된 데이터가 없습니다.

'} +
+ ${inactiveHtml} + ${bottomAd} +
+
+

© ${new Date().getFullYear()} ${escapeHtml(site.name)}

+

본 사이트는 정보 제공 목적이며, 불법 다운로드를 조장하지 않습니다.

+
+ +`; +} + +function renderAdBlock(clientId, slotId) { + if (!clientId) return ''; + return ` +
+
Advertisement
+ + +
`; +} + +function escapeHtml(str) { + if (!str) return ''; + return String(str) + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +async function getAdsenseConfig(configId) { + if (!configId) return null; + const { rows } = await db.query('SELECT * FROM adsense_configs WHERE id = $1 AND is_active = TRUE', [configId]); + return rows[0] || null; +} + +async function logCrawl(siteId, action, message) { + await db.query( + 'INSERT INTO crawl_logs (site_id, action, message) VALUES ($1, $2, $3)', + [siteId, action, message] + ); +} + +module.exports = { crawlSite, parseHtml }; diff --git a/src/services/scheduler.js b/src/services/scheduler.js new file mode 100644 index 0000000..3327555 --- /dev/null +++ b/src/services/scheduler.js @@ -0,0 +1,88 @@ +const cron = require('node-cron'); +const db = require('../db'); +const { crawlSite } = require('./crawler'); + +// 활성 스케줄 저장 (siteId -> cronJob) +const activeJobs = new Map(); + +/** + * DB에서 스케줄 설정된 사이트들을 로드하고 크론 등록 + */ +async function initScheduler() { + console.log('[SCHEDULER] 스케줄러 초기화...'); + + const { rows } = await db.query( + 'SELECT id, name, cron_schedule FROM sites WHERE schedule_active = TRUE AND cron_schedule != \'\'' + ); + + for (const site of rows) { + registerJob(site.id, site.name, site.cron_schedule); + } + + console.log(`[SCHEDULER] ${rows.length}개 스케줄 등록 완료`); +} + +/** + * 사이트 크론잡 등록 + */ +function registerJob(siteId, siteName, cronExpression) { + // 기존 잡 제거 + removeJob(siteId); + + if (!cron.validate(cronExpression)) { + console.error(`[SCHEDULER] 잘못된 크론 표현식: ${cronExpression} (site: ${siteName})`); + return false; + } + + const job = cron.schedule(cronExpression, async () => { + console.log(`[SCHEDULER] 자동 크롤링 시작: ${siteName} (ID: ${siteId})`); + try { + await crawlSite(siteId); + console.log(`[SCHEDULER] 자동 크롤링 완료: ${siteName}`); + } catch (err) { + console.error(`[SCHEDULER] 자동 크롤링 실패: ${siteName}`, err.message); + } + }, { + timezone: 'Asia/Seoul', + }); + + activeJobs.set(siteId, job); + console.log(`[SCHEDULER] 등록: ${siteName} (${cronExpression})`); + return true; +} + +/** + * 크론잡 제거 + */ +function removeJob(siteId) { + if (activeJobs.has(siteId)) { + activeJobs.get(siteId).stop(); + activeJobs.delete(siteId); + } +} + +/** + * 스케줄 업데이트 (관리자 페이지에서 호출) + */ +async function updateSchedule(siteId, cronExpression, active) { + await db.query( + 'UPDATE sites SET cron_schedule = $1, schedule_active = $2, updated_at = NOW() WHERE id = $3', + [cronExpression, active, siteId] + ); + + if (active && cronExpression) { + const { rows } = await db.query('SELECT name FROM sites WHERE id = $1', [siteId]); + registerJob(siteId, rows[0]?.name || '', cronExpression); + } else { + removeJob(siteId); + } +} + +/** + * 현재 활성 스케줄 목록 + */ +function getActiveJobs() { + return Array.from(activeJobs.keys()); +} + +module.exports = { initScheduler, registerJob, removeJob, updateSchedule, getActiveJobs }; diff --git a/stop-local.bat b/stop-local.bat new file mode 100644 index 0000000..23450a1 --- /dev/null +++ b/stop-local.bat @@ -0,0 +1,6 @@ +@echo off +cd /d "%~dp0" +echo 컨테이너 중지 중... +docker compose -f docker-compose.dev.yml down +echo 완료. +pause diff --git a/views/admin/adsense.ejs b/views/admin/adsense.ejs new file mode 100644 index 0000000..7c9e782 --- /dev/null +++ b/views/admin/adsense.ejs @@ -0,0 +1,115 @@ +<%- include('layout', { page: 'adsense', pageTitle: 'AdSense 관리', body: ` + +
+
+

AdSense 설정 목록

+ +
+ + + +
ID이름Client ID상단 슬롯중간 슬롯하단 슬롯상태액션
+
+ + + + +` }) %> diff --git a/views/admin/dashboard.ejs b/views/admin/dashboard.ejs new file mode 100644 index 0000000..5a3b483 --- /dev/null +++ b/views/admin/dashboard.ejs @@ -0,0 +1,56 @@ +<%- include('layout', { page: 'dashboard', pageTitle: '대시보드', body: ` + +
+
-
등록된 사이트
+
-
스케줄 활성
+
-
총 크롤링 횟수
+
-
AdSense 설정
+
+ +
+
+

사이트 현황

+ 사이트 관리 → +
+ + + +
사이트명URL스케줄마지막 크롤링상태공개 URL
+
+ +
+

최근 로그

+ + + +
시간사이트액션메시지
+
+ + +` }) %> diff --git a/views/admin/domains.ejs b/views/admin/domains.ejs new file mode 100644 index 0000000..8510e35 --- /dev/null +++ b/views/admin/domains.ejs @@ -0,0 +1,82 @@ +<%- include('layout', { page: 'domains', pageTitle: '도메인 매핑', body: ` + +
+
+

도메인 매핑

+ +
+

+ 도메인을 특정 사이트에 연결하면, 해당 도메인으로 접속 시 크롤링 결과가 자동으로 표시됩니다.
+ 슬러그 기반 접근도 가능합니다: /s/{slug} +

+ + + +
도메인연결 사이트상태등록일액션
+
+ + + + +` }) %> diff --git a/views/admin/layout.ejs b/views/admin/layout.ejs new file mode 100644 index 0000000..f0f8748 --- /dev/null +++ b/views/admin/layout.ejs @@ -0,0 +1,142 @@ + + + + + +<%= typeof pageTitle !== 'undefined' ? pageTitle : 'Crawl Manager' %> + + + + +
+
+

<%= typeof pageTitle !== 'undefined' ? pageTitle : '' %>

+ Crawl Manager v1.0 +
+
+ <%- body %> +
+
+
+ + + diff --git a/views/admin/logs.ejs b/views/admin/logs.ejs new file mode 100644 index 0000000..d2068e8 --- /dev/null +++ b/views/admin/logs.ejs @@ -0,0 +1,27 @@ +<%- include('layout', { page: 'logs', pageTitle: '크롤링 로그', body: ` + +
+
+

최근 로그

+ +
+ + + +
시간사이트액션메시지
+
+ + +` }) %> diff --git a/views/admin/site-detail.ejs b/views/admin/site-detail.ejs new file mode 100644 index 0000000..c82f7d2 --- /dev/null +++ b/views/admin/site-detail.ejs @@ -0,0 +1,135 @@ +<%- include('layout', { page: 'sites', pageTitle: '사이트 상세', body: ` + +
+ + +
+

크롤링 스케줄

+
+
+ + +
+ 5분마다 + 매시간 + 6시간마다 + 매일 06시 + 하루 3회 + 매일 자정 + 매주 월요일 +
+
+
+ +
+ +
+
+
+ + +
+ + +
+

크롤링 결과

+ + + +
ID시간상태항목 수에러액션
+
+ + + + + +` }) %> diff --git a/views/admin/sites.ejs b/views/admin/sites.ejs new file mode 100644 index 0000000..e3e8d37 --- /dev/null +++ b/views/admin/sites.ejs @@ -0,0 +1,183 @@ +<%- include('layout', { page: 'sites', pageTitle: '사이트 관리', body: ` + +
+
+

크롤링 대상 사이트

+ +
+ + + +
ID사이트명URL슬러그스케줄마지막 크롤링액션
+
+ + + + + +` }) %>