小说字体反爬破解指南——Python实现woff2字体逆向解析与OCR识别 发表于 2025-03-22 | 更新于 2025-03-23
| 总字数: 264 | 阅读时长: 1分钟 | 浏览量:
技术背景 小说等内容平台采用动态woff2字体对文字进行反爬混淆,传统的静态CSS映射方法已无法满足需求。本文通过Python实现以下步骤:
使用fontTools解析woff2字体结构
基于FreeTypePen将矢量字形渲染为位图
结合Tesseract OCR实现字符识别
构建完整字体映射表
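环境准备 pip install fonttools brotli freetype-py pillow pytesseract(说明:fontTools解析woff2需要brotli,FreeTypePen依赖freetype-py;此外还需在系统中安装Tesseract OCR程序及chi_sim简体中文语言包)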
# Code: render every glyph of a woff2 font with FreeTypePen and OCR it with Tesseract.
import os

import pytesseract
from fontTools.misc.transform import Offset
from fontTools.pens.freetypePen import FreeTypePen
from fontTools.ttLib import TTFont

# Location of the Tesseract binary on the author's Windows machine.  Only
# override pytesseract's default when that file actually exists, so the script
# still works on machines where `tesseract` is found on PATH instead.
_WINDOWS_TESSERACT = r'D:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT


def _safe_filename(glyph_name):
    """Return *glyph_name* with characters other than alphanumerics and ``._-`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in glyph_name)


def export_all_glyphs(woff2_path, output_dir="glyph_images", lang="chi_sim",
                      ocr_config="--psm 6"):
    """Rasterize each glyph of a woff2 font, OCR it, and return the mapping.

    Every glyph except those whose name contains ``notdef`` (and those with a
    non-positive advance width) is drawn onto a canvas that is as wide as the
    glyph's advance width and as tall as ``usWinAscent + usWinDescent`` from
    the font's ``OS/2`` table.  The rendered image is saved as
    ``<output_dir>/<glyph name>.png`` and passed to Tesseract.  Each result is
    also printed as ``<glyph name> ==== <text>``.

    Args:
        woff2_path: Path of the woff2 font file to open.
        output_dir: Directory that receives the rendered glyph PNGs; created
            if it does not exist.
        lang: Tesseract language code passed to ``image_to_string``.
        ocr_config: Extra Tesseract command-line options passed to
            ``image_to_string``.

    Returns:
        dict mapping glyph name to the recognized text with surrounding
        whitespace stripped (an empty string when OCR recognized nothing).
    """
    os.makedirs(output_dir, exist_ok=True)
    mapping = {}

    # The context manager closes the font even if rendering or OCR raises.
    with TTFont(woff2_path, flavor='woff2') as font:
        glyph_set = font.getGlyphSet()

        os2 = font['OS/2']
        win_descent = os2.usWinDescent
        # Canvas spans from the bottom of the descent to the top of the ascent.
        canvas_height = os2.usWinAscent + win_descent
        # Outlines are relative to the baseline; shift them up by the descent
        # so the parts below the baseline still land inside the canvas.
        baseline_shift = Offset(0, win_descent)

        # .keys() is kept (rather than iterating the glyph set directly)
        # because that is the accessor the original listing relied on.
        for glyph_name in glyph_set.keys():
            if 'notdef' in glyph_name:
                continue

            glyph = glyph_set[glyph_name]
            if glyph.width <= 0:
                # A zero-width canvas has no pixels to rasterize or OCR.
                continue

            pen = FreeTypePen(None)
            glyph.draw(pen)
            image = pen.image(
                width=glyph.width,
                height=canvas_height,
                transform=baseline_shift,
            )
            image.save(os.path.join(output_dir, _safe_filename(glyph_name) + ".png"))

            # image_to_string returns trailing whitespace/newlines; strip them
            # so the mapping values are just the recognized characters.
            text = pytesseract.image_to_string(
                image, lang=lang, config=ocr_config
            ).strip()
            mapping[glyph_name] = text
            print(glyph_name, '====', text)

    return mapping


if __name__ == "__main__":
    export_all_glyphs("dc027189e0ba4cd.woff2")