diff --git a/deploy_remote.py b/deploy_remote.py
new file mode 100644
index 0000000..e4e64b7
--- /dev/null
+++ b/deploy_remote.py
@@ -0,0 +1,54 @@
+import io
+import os
+import sys
+
+import paramiko
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
+
+ssh = paramiko.SSHClient()
+ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ssh.connect('211.115.91.140', port=12991, username='three', password='qlalfqjsgh11', timeout=15)
+
+def run(cmd, t=60):
+ stdin, stdout, stderr = ssh.exec_command(cmd, timeout=t)
+ o = stdout.read().decode('utf-8', errors='replace')
+ e = stderr.read().decode('utf-8', errors='replace')
+ if o.strip(): print(o.strip()[:5000])
+ if e.strip(): print('[ERR]', e.strip()[:500])
+
+# 1. Check the current site names + analyse the cause of the garbled Korean text
+print('=== Check sites ===')
+run('docker exec crawl-manager-db psql -U crawler -d crawler -c "SELECT id, name, slug FROM sites ORDER BY id;"')
+
+# 2. Check the Korean text in the crawled HTML
+print('\n=== Check encoding in crawl results ===')
+
+script = r"""
+const db = require('./src/db');
+(async () => {
+ await db.waitForDB();
+ // 각 사이트 최신 렌더링 HTML의 title 태그 확인
+ const sites = await db.query('SELECT id, name FROM sites ORDER BY id');
+ for (const s of sites.rows) {
+ const r = await db.query('SELECT rendered_html FROM crawl_results WHERE site_id=$1 AND status=$2 ORDER BY crawled_at DESC LIMIT 1', [s.id, 'success']);
+ if (r.rows.length && r.rows[0].rendered_html) {
+ var html = r.rows[0].rendered_html;
+ var titleMatch = html.match(/
(.*?)<\/title>/);
+ var charsetMatch = html.match(/{const ct=r.headers['content-type']||'';console.log('Content-Type:',ct);const buf=Buffer.from(r.data);console.log('First 500 bytes:',buf.toString('utf-8').substring(0,500));console.log('EUC-KR test:');try{const iconv=require('iconv-lite');console.log(iconv.decode(buf,'euc-kr').substring(0,300));}catch(e){console.log('no iconv-lite');}}).catch(e=>console.log(e.message))\"")
+
+ssh.close()
diff --git a/package.json b/package.json
index 3b072d9..5b02436 100644
--- a/package.json
+++ b/package.json
@@ -17,6 +17,7 @@
"ejs": "^3.1.10",
"dotenv": "^16.4.0",
"https-proxy-agent": "^7.0.0",
- "cookie-parser": "^1.4.6"
+ "cookie-parser": "^1.4.6",
+ "iconv-lite": "^0.6.3"
}
}
diff --git a/src/routes/api.js b/src/routes/api.js
index 12b5df6..0fedb16 100644
--- a/src/routes/api.js
+++ b/src/routes/api.js
@@ -82,16 +82,31 @@ router.post('/fetch-page', async (req, res) => {
if (!url) return res.status(400).json({ error: 'URL is required' });
const axios = require('axios');
const https = require('https');
+ const iconv = require('iconv-lite');
const response = await axios.get(url, {
httpsAgent: new https.Agent({ rejectUnauthorized: false }),
timeout: 30000,
+ responseType: 'arraybuffer',
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
},
maxRedirects: 5,
});
- res.json({ html: response.data, finalUrl: response.request?.res?.responseUrl || url });
+ // 인코딩 감지 + 변환
+ const ct = response.headers['content-type'] || '';
+ const ctMatch = ct.match(/charset=([^\s;]+)/i);
+ let html;
+ if (ctMatch && ctMatch[1].toLowerCase().replace(/['"]/g, '') !== 'utf-8' && iconv.encodingExists(ctMatch[1])) {
+ html = iconv.decode(Buffer.from(response.data), ctMatch[1]);
+ } else {
+ html = Buffer.from(response.data).toString('utf-8');
+ const metaMatch = html.match(/]+charset=["']?([^"'\s;>]+)/i);
+ if (metaMatch && metaMatch[1].toLowerCase() !== 'utf-8' && iconv.encodingExists(metaMatch[1])) {
+ html = iconv.decode(Buffer.from(response.data), metaMatch[1]);
+ }
+ }
+ res.json({ html, finalUrl: response.request?.res?.responseUrl || url });
} catch (err) {
res.status(500).json({ error: err.message });
}
diff --git a/src/services/crawler.js b/src/services/crawler.js
index fb949a1..a7e2db2 100644
--- a/src/services/crawler.js
+++ b/src/services/crawler.js
@@ -1,18 +1,44 @@
const axios = require('axios');
const cheerio = require('cheerio');
const https = require('https');
+const iconv = require('iconv-lite');
const db = require('../db');
// Axios client used for crawl requests.
// - rejectUnauthorized: false ignores SSL certificate errors (self-signed certificates, etc.).
// - responseType 'arraybuffer' requests the body as raw bytes instead of decoded text,
//   so the caller decides which character encoding to apply.
// - The headers send a desktop Chrome User-Agent and prefer Korean content.
const axiosInstance = axios.create({
httpsAgent: new https.Agent({ rejectUnauthorized: false }),
timeout: 30000,
+ responseType: 'arraybuffer',
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
},
});
+/**
+ * 응답 버퍼를 인코딩 감지 후 UTF-8 문자열로 변환
+ */
+function decodeResponse(buffer, contentType) {
+ // Content-Type 헤더에서 charset 감지
+ const ctMatch = (contentType || '').match(/charset=([^\s;]+)/i);
+ if (ctMatch) {
+ const charset = ctMatch[1].toLowerCase().replace(/['"]/g, '');
+ if (charset !== 'utf-8' && charset !== 'utf8' && iconv.encodingExists(charset)) {
+ return iconv.decode(Buffer.from(buffer), charset);
+ }
+ }
+ const str = Buffer.from(buffer).toString('utf-8');
+ // HTML meta charset 감지
+ const metaMatch = str.match(/]+charset=["']?([^"'\s;>]+)/i);
+ if (metaMatch) {
+ const charset = metaMatch[1].toLowerCase();
+ if (charset !== 'utf-8' && charset !== 'utf8' && iconv.encodingExists(charset)) {
+ return iconv.decode(Buffer.from(buffer), charset);
+ }
+ }
+ return str;
+}
+
/**
* 사이트를 크롤링하고 DB에 저장
*/
@@ -32,7 +58,8 @@ async function crawlSite(siteId) {
// 디스커버리: 목록 페이지에서 최신 글 URL을 자동 탐색
await logCrawl(siteId, 'discovery', `목록 페이지에서 최신 글 탐색: ${site.url}`);
const listResponse = await axiosInstance.get(site.url);
- const $list = cheerio.load(listResponse.data);
+ const listHtml = decodeResponse(listResponse.data, listResponse.headers['content-type']);
+ const $list = cheerio.load(listHtml);
const selector = parseRules.discovery.link_selector || 'a.read-more, a.more-link, .entry-title a, article a';
const linkEl = $list(selector).first();
@@ -49,9 +76,9 @@ async function crawlSite(siteId) {
}
}
- // 2. HTML 가져오기
+ // 2. HTML 가져오기 (인코딩 자동 감지)
const response = await axiosInstance.get(targetUrl);
- const rawHtml = response.data;
+ const rawHtml = decodeResponse(response.data, response.headers['content-type']);
// 3. 파싱 규칙에 따라 데이터 추출
const parsedData = parseHtml(rawHtml, parseRules);