"""Remote diagnostic for broken Korean text in crawl-manager results.

Connects to the deployment host over SSH and runs three read-only checks:

1. Lists the configured sites straight from the PostgreSQL container.
2. Uploads a small Node.js script into the ``crawl-manager`` container that
   prints, for every site, the charset and ``<title>`` found in the most
   recent successfully rendered HTML.
3. Fetches the original site from inside the container and prints the bytes
   decoded both as UTF-8 and as EUC-KR.

Connection settings come from the environment:

* ``DEPLOY_SSH_HOST`` / ``DEPLOY_SSH_PORT`` / ``DEPLOY_SSH_USER`` - optional,
  default to the values this script has always used.
* ``DEPLOY_SSH_PASSWORD`` - the password; when unset it is prompted for.

SECURITY NOTE: an earlier revision embedded the SSH password in this file.
That password is in version-control history and must be rotated.
"""
import getpass
import os
import sys

import paramiko

SSH_HOST = os.environ.get('DEPLOY_SSH_HOST', '211.115.91.140')
SSH_PORT = int(os.environ.get('DEPLOY_SSH_PORT', '12991'))
SSH_USER = os.environ.get('DEPLOY_SSH_USER', 'three')

# Node.js script executed inside the crawl-manager container (step 2).
# The title pattern anchors on the opening <title> tag and spans newlines;
# without the opening tag it captured all markup preceding </title> on that line.
CHECK_SCRIPT = r"""
const db = require('./src/db');
(async () => {
  await db.waitForDB();
  // 각 사이트 최신 렌더링 HTML의 title 태그 확인
  const sites = await db.query('SELECT id, name FROM sites ORDER BY id');
  for (const s of sites.rows) {
    const r = await db.query('SELECT rendered_html FROM crawl_results WHERE site_id=$1 AND status=$2 ORDER BY crawled_at DESC LIMIT 1', [s.id, 'success']);
    if (r.rows.length && r.rows[0].rendered_html) {
      var html = r.rows[0].rendered_html;
      var titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
      var charsetMatch = html.match(/<meta charset="([^"]+)"/i);
      console.log('Site', s.id, s.name, '| charset:', charsetMatch?charsetMatch[1]:'none', '| title:', titleMatch?titleMatch[1].trim():'none');
    } else {
      console.log('Site', s.id, s.name, '| NO RESULT');
    }
  }
  process.exit(0);
})();
"""

# One-liner executed with `node -e` inside the container (step 3).
ORIGIN_ENCODING_CMD = "docker exec crawl-manager node -e \"const ax=require('axios');const https=require('https');ax.get('https://xn--ph1bph0az41x.life/',{httpsAgent:new https.Agent({rejectUnauthorized:false}),responseType:'arraybuffer',timeout:15000}).then(r=>{const ct=r.headers['content-type']||'';console.log('Content-Type:',ct);const buf=Buffer.from(r.data);console.log('First 500 bytes:',buf.toString('utf-8').substring(0,500));console.log('EUC-KR test:');try{const iconv=require('iconv-lite');console.log(iconv.decode(buf,'euc-kr').substring(0,300));}catch(e){console.log('no iconv-lite');}}).catch(e=>console.log(e.message))\""

# Creating the client opens no connection; main() connects and closes it.
ssh = paramiko.SSHClient()


def run(cmd, t=60):
    """Run *cmd* on the remote host and echo its output.

    Args:
        cmd: Shell command line executed through the module-level ``ssh`` client.
        t: Channel timeout in seconds passed to ``exec_command``.

    Returns:
        The remote exit status as reported by the SSH channel.

    Output is decoded as UTF-8 with replacement characters. Stdout is
    truncated to 5000 characters and stderr to 500 so one noisy command
    cannot flood the terminal.
    """
    _stdin, stdout, stderr = ssh.exec_command(cmd, timeout=t)
    out = stdout.read().decode('utf-8', errors='replace').strip()
    err = stderr.read().decode('utf-8', errors='replace').strip()
    if out:
        print(out[:5000])
    if err:
        print('[ERR]', err[:500])
    # Ask for the status only after draining both streams, so the remote
    # process is never blocked on a full pipe while we wait for it to exit.
    status = stdout.channel.recv_exit_status()
    if status != 0:
        print('[EXIT]', status)
    return status


def main():
    """Connect to the host, run the three diagnostic steps, and disconnect."""
    # Korean text must survive consoles whose default encoding is not UTF-8.
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')

    password = os.environ.get('DEPLOY_SSH_PASSWORD') or getpass.getpass(
        f'SSH password for {SSH_USER}@{SSH_HOST}: ')

    # A host already present in known_hosts is verified against its stored key.
    # NOTE(review): AutoAddPolicy still trusts a never-seen host blindly;
    # switch to RejectPolicy once the host key has been recorded.
    ssh.load_system_host_keys()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    # The context manager closes the connection even when a step raises.
    with ssh:
        ssh.connect(SSH_HOST, port=SSH_PORT, username=SSH_USER,
                    password=password, timeout=15)

        # 1. List the current site names (starting point for the mojibake analysis).
        print('=== Check sites ===')
        run('docker exec crawl-manager-db psql -U crawler -d crawler -c "SELECT id, name, slug FROM sites ORDER BY id;"')

        # 2. Inspect the Korean text in the stored crawl results.
        print('\n=== Check encoding in crawl results ===')
        with ssh.open_sftp() as sftp, sftp.file('/tmp/check.js', 'w') as f:
            f.write(CHECK_SCRIPT)
        run('docker cp /tmp/check.js crawl-manager:/app/check.js')
        run('docker exec crawl-manager node /app/check.js')

        # 3. Check which encoding the original site actually serves.
        print('\n=== Original site encoding ===')
        run(ORIGIN_ENCODING_CMD)


if __name__ == '__main__':
    main()
/**
 * Decode a raw HTTP response body into a string, detecting its charset.
 *
 * Detection order:
 *   1. The `charset` parameter of the Content-Type header, when it names a
 *      non-UTF-8 encoding that iconv-lite supports.
 *   2. The first `<meta ... charset=...>` found in the body (read as UTF-8),
 *      under the same condition.
 *   3. Otherwise the body is returned decoded as UTF-8.
 *
 * @param {Buffer|ArrayBuffer|Uint8Array|null|undefined} buffer
 *   Response body as delivered by axios with `responseType: 'arraybuffer'`.
 * @param {string} [contentType] Value of the Content-Type response header.
 * @returns {string} The decoded text; an empty string when there is no body.
 */
function decodeResponse(buffer, contentType) {
  // A missing body (e.g. an empty response) decodes to an empty string
  // instead of letting Buffer.from() throw a TypeError.
  if (buffer == null) return '';

  // Buffer.from(<Buffer>) copies the whole payload; wrap only when the
  // caller handed us something that is not already a Buffer.
  const body = Buffer.isBuffer(buffer) ? buffer : Buffer.from(buffer);

  // Normalize a charset label and return it only when it needs transcoding:
  // not UTF-8 and known to iconv-lite. Otherwise return null.
  const legacyCharset = (label) => {
    const charset = label.toLowerCase().replace(/['"]/g, '');
    const isUtf8 = charset === 'utf-8' || charset === 'utf8';
    return !isUtf8 && iconv.encodingExists(charset) ? charset : null;
  };

  // 1. Charset declared in the Content-Type header.
  const headerMatch = (contentType || '').match(/charset=([^\s;]+)/i);
  const headerCharset = headerMatch && legacyCharset(headerMatch[1]);
  if (headerCharset) {
    return iconv.decode(body, headerCharset);
  }

  // 2. Charset declared in an HTML <meta> tag. ASCII markup reads the same
  //    in UTF-8 and in the legacy encodings handled here, so a UTF-8 read is
  //    enough to find the declaration.
  const text = body.toString('utf-8');
  const metaMatch = text.match(/<meta[^>]+charset=["']?([^"'\s;>]+)/i);
  const metaCharset = metaMatch && legacyCharset(metaMatch[1]);

  // 3. Fall back to the UTF-8 text when nothing requires transcoding.
  return metaCharset ? iconv.decode(body, metaCharset) : text;
}