d9e0a5b3f4
- iconv-lite 추가 - 크롤러/API 모두 Content-Type charset 자동 감지 - HTML meta charset 폴백 감지
55 lines
2.6 KiB
Python
55 lines
2.6 KiB
Python
import io
import os
import sys

import paramiko
|
|
# Re-wrap stdout as UTF-8 so non-ASCII command output can always be printed;
# characters the stream cannot encode are replaced instead of raising.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

# Module-level SSH connection shared by everything below.
ssh = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts any unknown host key without verification.
# Consider loading known hosts and using RejectPolicy once the key is pinned.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

# Connection settings can be overridden through the environment so credentials
# need not live in the source file. The literals are the previous hard-coded
# values, kept only as fallbacks so existing invocations keep working.
# NOTE(review): the fallback password is committed in plain text; rotate it and
# drop the default once CRAWL_SSH_PASSWORD is provisioned.
ssh.connect(
    os.environ.get('CRAWL_SSH_HOST', '211.115.91.140'),
    port=int(os.environ.get('CRAWL_SSH_PORT', '12991')),
    username=os.environ.get('CRAWL_SSH_USER', 'three'),
    password=os.environ.get('CRAWL_SSH_PASSWORD', 'qlalfqjsgh11'),
    timeout=15,
)
|
|
|
|
def run(cmd, t=60, client=None):
    """Run a shell command over SSH and print a trimmed view of its output.

    Args:
        cmd: Command line to execute on the remote host.
        t: Timeout in seconds passed to ``exec_command``.
        client: Object exposing ``exec_command(cmd, timeout=...)``. When
            omitted, the module-level ``ssh`` connection is used.

    Returns:
        The remote command's exit status.

    Output is decoded as UTF-8 with undecodable bytes replaced. Stdout is
    printed up to 5000 characters, stderr up to 500 characters behind an
    ``[ERR]`` prefix, and a non-zero exit status is reported as ``[EXIT] n``.
    """
    conn = ssh if client is None else client
    stdin, stdout, stderr = conn.exec_command(cmd, timeout=t)

    o = stdout.read().decode('utf-8', errors='replace')
    e = stderr.read().decode('utf-8', errors='replace')
    if o.strip():
        print(o.strip()[:5000])
    if e.strip():
        print('[ERR]', e.strip()[:500])

    # Queried only after both streams are drained. A command can fail without
    # writing to stderr, so the exit status is the only reliable failure signal.
    status = stdout.channel.recv_exit_status()
    if status != 0:
        print('[EXIT]', status)
    return status
|
|
|
|
# 1. List the current site names (first step in analysing the broken Korean text)
print('=== Check sites ===')

# The query is passed to psql inside the database container; double quotes
# delimit it on the remote shell command line.
sites_sql = 'SELECT id, name, slug FROM sites ORDER BY id;'
run('docker exec crawl-manager-db psql -U crawler -d crawler -c "' + sites_sql + '"')
|
|
|
|
# 2. Check how Korean text looks in the crawled HTML
print('\n=== Check encoding in crawl results ===')

# Node.js diagnostic executed inside the crawl-manager container. For every
# site it prints the declared charset and <title> of the latest successful
# crawl result.
#  - The title pattern tolerates attributes, upper-case tags and line breaks.
#  - The charset pattern accepts both <meta charset="..."> and the
#    <meta http-equiv="Content-Type" content="...; charset=..."> form, quoted
#    or unquoted; matching only the former reports such pages as 'none'.
#  - A rejected promise is logged and ends the process with exit code 1
#    instead of surfacing as an unhandled rejection.
script = r"""
const db = require('./src/db');
(async () => {
  await db.waitForDB();
  // 각 사이트 최신 렌더링 HTML의 title 태그 확인
  const sites = await db.query('SELECT id, name FROM sites ORDER BY id');
  for (const s of sites.rows) {
    const r = await db.query('SELECT rendered_html FROM crawl_results WHERE site_id=$1 AND status=$2 ORDER BY crawled_at DESC LIMIT 1', [s.id, 'success']);
    if (r.rows.length && r.rows[0].rendered_html) {
      const html = r.rows[0].rendered_html;
      const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
      const charsetMatch = html.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s\/>;]+)/i);
      console.log('Site', s.id, s.name, '| charset:', charsetMatch ? charsetMatch[1] : 'none', '| title:', titleMatch ? titleMatch[1].trim() : 'none');
    } else {
      console.log('Site', s.id, s.name, '| NO RESULT');
    }
  }
  process.exit(0);
})().catch((err) => {
  console.error(err);
  process.exit(1);
});
"""

# Upload the script to the SSH host, then copy it into the container so that
# require('./src/db') resolves relative to /app.
sftp = ssh.open_sftp()
try:
    with sftp.file('/tmp/check.js', 'w') as f:
        f.write(script)
finally:
    # Release the SFTP session even if the upload fails.
    sftp.close()

run('docker cp /tmp/check.js crawl-manager:/app/check.js')
run('docker exec crawl-manager node /app/check.js')
|
|
|
|
# 3. Check the encoding served by the original site
print('\n=== Original site encoding ===')

# One-line Node.js probe, assembled from adjacent literals for readability.
# It fetches the page as raw bytes with TLS certificate verification disabled,
# prints the Content-Type header and the first 500 characters decoded as UTF-8,
# then prints the first 300 characters decoded as EUC-KR via iconv-lite
# (or 'no iconv-lite' if that step throws).
probe_js = (
    "const ax=require('axios');"
    "const https=require('https');"
    "ax.get('https://xn--ph1bph0az41x.life/',"
    "{httpsAgent:new https.Agent({rejectUnauthorized:false}),"
    "responseType:'arraybuffer',timeout:15000})"
    ".then(r=>{"
    "const ct=r.headers['content-type']||'';"
    "console.log('Content-Type:',ct);"
    "const buf=Buffer.from(r.data);"
    "console.log('First 500 bytes:',buf.toString('utf-8').substring(0,500));"
    "console.log('EUC-KR test:');"
    "try{const iconv=require('iconv-lite');"
    "console.log(iconv.decode(buf,'euc-kr').substring(0,300));}"
    "catch(e){console.log('no iconv-lite');}"
    "})"
    ".catch(e=>console.log(e.message))"
)
# The script contains no double quotes, so it can be wrapped in them for the
# remote shell.
run('docker exec crawl-manager node -e "' + probe_js + '"')

# Done with the remote host.
ssh.close()
|