diff --git a/reader/CMakeLists.txt b/reader/CMakeLists.txt
index c5c29bc0..6bd83a0d 100644
--- a/reader/CMakeLists.txt
+++ b/reader/CMakeLists.txt
@@ -4,7 +4,7 @@ pkg_check_modules(LIBJPEG REQUIRED libjpeg)
 pkg_check_modules(DDJVU REQUIRED ddjvuapi)
 
 if (XPS_SUPPORT_ENABLED)
-    pkg_check_modules(XPS_DEPS REQUIRED libgxps cairo glib-2.0 gobject-2.0)
+    pkg_check_modules(XPS_DEPS REQUIRED libgxps cairo glib-2.0 gobject-2.0 freetype2)
 endif()
 
 # 添加位置无关代码编译标志
diff --git a/reader/document/XpsTextExtractor.cpp b/reader/document/XpsTextExtractor.cpp
index c6fef568..8bdffa77 100644
--- a/reader/document/XpsTextExtractor.cpp
+++ b/reader/document/XpsTextExtractor.cpp
@@ -9,8 +9,12 @@
 
 #include
 #include
+#include <QFileInfo>     // NOTE(review): header name was lost in extraction; inferred from usage - confirm
 #include
 #include
+#include <QCache>        // NOTE(review): inferred from usage - confirm
+#include <QMutex>        // NOTE(review): inferred from usage - confirm
+#include <QMutexLocker>  // NOTE(review): inferred from usage - confirm
 
 #include
 
@@ -25,6 +29,10 @@
 
 extern "C" {
 #include
+#include <ft2build.h>    // NOTE(review): inferred; must precede FT_FREETYPE_H - confirm
+#include FT_FREETYPE_H
+#include <cairo.h>       // NOTE(review): header name was lost in extraction; inferred - confirm
+#include <cairo-ft.h>    // NOTE(review): header name was lost in extraction; inferred - confirm
 #include
 }
 
@@ -35,6 +43,361 @@ extern "C" {
 #pragma pop_macro("signals")
 #endif
 
+namespace {
+
+// Process-wide FreeType library handle; created lazily and guarded by ftLibraryMutex.
+static FT_Library ftLibrary = nullptr;
+static QMutex ftLibraryMutex;
+static bool ftLibraryInitialized = false;
+
+// Font cache key: the XPS file path plus the font URI inside that package.
+struct FontCacheKey {
+    QString filePath;
+    QString fontUri;
+    bool operator==(const FontCacheKey &other) const {
+        return filePath == other.filePath && fontUri == other.fontUri;
+    }
+};
+
+uint qHash(const FontCacheKey &key, uint seed = 0) {
+    return qHash(key.filePath, seed) ^ qHash(key.fontUri, seed);
+}
+
+// Cache entry: owns one reference to a cairo font face (nullptr records a font that failed to load).
+struct FontDataWrapper {
+    cairo_font_face_t* fontFace;
+
+    explicit FontDataWrapper(cairo_font_face_t* face) : fontFace(face) {}
+
+    ~FontDataWrapper() {
+        if (fontFace) {
+            cairo_font_face_destroy(fontFace);
+        }
+    }
+
+    // Not copyable: a copy would release the same reference twice.
+    FontDataWrapper(const FontDataWrapper &) = delete;
+    FontDataWrapper &operator=(const FontDataWrapper &) = delete;
+};
+
+static QCache<FontCacheKey, FontDataWrapper> fontCache(100); // holds at most 100 fonts
+static QMutex fontCacheMutex;
+
+// Initializes the shared FreeType library once; on failure ftLibraryInitialized stays false.
+void initFreeTypeLibrary() {
+    QMutexLocker locker(&ftLibraryMutex);
+    if (!ftLibraryInitialized) {
+        FT_Error error = FT_Init_FreeType(&ftLibrary);
+        if (error == 0) {
+            ftLibraryInitialized = true;
+        }
+    }
+}
+
+// Parses a GUID string into 16 byte values (needed for obfuscated fonts); returns false if str is not a GUID.
+bool parseGuid(const QString &str, unsigned short guid[16]) {
+    // GUID format: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX (36 characters)
+    // Position in the GUID string of each byte's two hex digits (based on the libgxps implementation)
+    static const int indexes[] = {6, 4, 2, 0, 11, 9, 16, 14, 19, 21, 24, 26, 28, 30, 32, 34};
+
+    if (str.length() < 36) {
+        return false;
+    }
+
+    for (int i = 0; i < 16; i++) {
+        if (indexes[i] + 1 >= str.length()) {
+            return false;
+        }
+        QChar c1 = str[indexes[i]];
+        QChar c2 = str[indexes[i] + 1];
+
+        bool ok1 = false, ok2 = false;
+        unsigned hex1 = c1.toLatin1();
+        unsigned hex2 = c2.toLatin1();
+
+        if (hex1 >= '0' && hex1 <= '9') {
+            hex1 -= '0';
+            ok1 = true;
+        } else if (hex1 >= 'a' && hex1 <= 'f') {
+            hex1 = hex1 - 'a' + 10;
+            ok1 = true;
+        } else if (hex1 >= 'A' && hex1 <= 'F') {
+            hex1 = hex1 - 'A' + 10;
+            ok1 = true;
+        }
+
+        if (hex2 >= '0' && hex2 <= '9') {
+            hex2 -= '0';
+            ok2 = true;
+        } else if (hex2 >= 'a' && hex2 <= 'f') {
+            hex2 = hex2 - 'a' + 10;
+            ok2 = true;
+        } else if (hex2 >= 'A' && hex2 <= 'F') {
+            hex2 = hex2 - 'A' + 10;
+            ok2 = true;
+        }
+
+        if (!ok1 || !ok2) {
+            return false;
+        }
+
+        guid[i] = static_cast<unsigned short>(hex1 * 16 + hex2);
+    }
+
+    return true;
+}
+
+// De-obfuscates a font in place: XORs its first 32 bytes with the GUID taken from the font's file name.
+void deobfuscateFontData(QByteArray &fontData, const QString &fontUri) {
+    QString baseName = QFileInfo(fontUri).baseName();
+    unsigned short guid[16];
+
+    if (!parseGuid(baseName, guid)) {
+        return; // File name is not a GUID: nothing to decode
+    }
+
+    if (fontData.size() < 32) {
+        return; // Font file too small
+    }
+
+    // Obfuscation: XOR font binary with bytes from guid (font's filename)
+    static const int mapping[] = {15, 14, 13, 12, 11, 10, 9, 8, 6, 7, 4, 5, 0, 1, 2, 3};
+
+    for (int i = 0; i < 16; i++) {
+        fontData[i] = static_cast<char>(static_cast<unsigned char>(fontData[i]) ^ guid[mapping[i]]);
+        if (i + 16 < fontData.size()) {
+            fontData[i + 16] = static_cast<char>(static_cast<unsigned char>(fontData[i + 16]) ^ guid[mapping[i]]);
+        }
+    }
+}
+
+// cairo user-data key under which each font face stores its FontFaceBacking.
+static cairo_user_data_key_t ft_cairo_key;
+
+// Reads one font part from the XPS (ZIP) package; returns an empty array if it is missing or unreadable.
+QByteArray readFontFromZipLocal(const QString &filePath, const QString &fontUri) {
+    if (filePath.isEmpty() || fontUri.isEmpty()) {
+        return QByteArray();
+    }
+
+    QString cleanFontUri = fontUri;
+    while (cleanFontUri.startsWith(QLatin1String("/")) || cleanFontUri.startsWith(QLatin1String("../"))) {
+        if (cleanFontUri.startsWith(QLatin1String("/"))) {
+            cleanFontUri = cleanFontUri.mid(1);
+        } else if (cleanFontUri.startsWith(QLatin1String("../"))) {
+            cleanFontUri = cleanFontUri.mid(3);
+        }
+    }
+
+    struct archive *a = archive_read_new();
+    struct archive_entry *entry = nullptr;
+
+    if (!a) {
+        return QByteArray();
+    }
+
+    archive_read_support_format_zip(a);
+    archive_read_support_format_all(a);
+
+    QByteArray utf8Path = QFile::encodeName(filePath);
+    int r = archive_read_open_filename(a, utf8Path.constData(), 10240);
+    if (r != ARCHIVE_OK) {
+        archive_read_free(a);
+        return QByteArray();
+    }
+
+    QByteArray result;
+    bool found = false;
+    const QString fontFileName = cleanFontUri.section(QLatin1Char('/'), -1);
+    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
+        QString entryPath = QString::fromUtf8(archive_entry_pathname(entry));
+        // Match the full part name, a path suffix, or the exact file name; a substring test can pick the wrong font.
+        if (entryPath == fontUri || entryPath == cleanFontUri ||
+            entryPath.endsWith(QLatin1String("/") + cleanFontUri) ||
+            (!fontFileName.isEmpty() && entryPath.section(QLatin1Char('/'), -1) == fontFileName)) {
+            la_int64_t size = archive_entry_size(entry);
+            if (size > 0 && size < 50 * 1024 * 1024) {
+                result.resize(static_cast<int>(size));
+                la_ssize_t readSize = archive_read_data(a, result.data(), size);
+                if (readSize == size) {
+                    found = true;
+                } else {
+                    result.clear();
+                }
+            }
+            break;
+        }
+        archive_read_data_skip(a);
+    }
+
+    archive_read_free(a);
+
+    if (!found) {
+        return QByteArray();
+    }
+
+    return result;
+}
+
+// Keeps a FreeType face and the font bytes it reads from alive for as long as the cairo font face that
+// wraps them; FT_New_Memory_Face() does not copy the buffer it is given.
+struct FontFaceBacking {
+    QByteArray fontData;
+    FT_Face face;
+};
+
+// cairo destroy callback for ft_cairo_key: releases the FreeType face, then the bytes behind it.
+void destroyFontFaceBacking(void *data) {
+    FontFaceBacking *backing = static_cast<FontFaceBacking *>(data);
+    if (backing->face) {
+        // Faces of a shared FT_Library must be created and destroyed under a lock.
+        QMutexLocker locker(&ftLibraryMutex);
+        FT_Done_Face(backing->face);
+    }
+    delete backing;
+}
+
+// Reads, de-obfuscates and parses one font; returns a font face owned by the caller, or nullptr on failure.
+cairo_font_face_t* createFontFace(const QString &filePath, const QString &fontUri) {
+    QByteArray fontData = readFontFromZipLocal(filePath, fontUri);
+    if (fontData.isEmpty()) {
+        return nullptr;
+    }
+
+    // Obfuscated fonts are decoded in place before FreeType sees them.
+    deobfuscateFontData(fontData, fontUri);
+
+    FontFaceBacking *backing = new FontFaceBacking{fontData, nullptr};
+
+    initFreeTypeLibrary();
+    {
+        QMutexLocker locker(&ftLibraryMutex);
+        if (!ftLibraryInitialized ||
+            FT_New_Memory_Face(ftLibrary,
+                               reinterpret_cast<const FT_Byte *>(backing->fontData.constData()),
+                               static_cast<FT_Long>(backing->fontData.size()),
+                               0,
+                               &backing->face) != 0) {
+            delete backing;
+            return nullptr;
+        }
+    }
+
+    cairo_font_face_t* fontFace = cairo_ft_font_face_create_for_ft_face(backing->face, 0);
+    if (cairo_font_face_status(fontFace) != CAIRO_STATUS_SUCCESS ||
+        cairo_font_face_set_user_data(fontFace, &ft_cairo_key, backing,
+                                      destroyFontFaceBacking) != CAIRO_STATUS_SUCCESS) {
+        // Ownership of backing was not handed over: drop the cairo object first, then the FreeType face.
+        cairo_font_face_destroy(fontFace);
+        destroyFontFaceBacking(backing);
+        return nullptr;
+    }
+
+    return fontFace;
+}
+
+// Returns a new reference to the cairo font face for (filePath, fontUri), loading and caching it on first use.
+// The caller releases it with cairo_font_face_destroy(); nullptr means the font could not be loaded.
+cairo_font_face_t* loadFontFace(const QString &filePath, const QString &fontUri) {
+    const FontCacheKey key{filePath, fontUri};
+
+    {
+        QMutexLocker locker(&fontCacheMutex);
+        if (FontDataWrapper* cached = fontCache.object(key)) {
+            // Take the caller's reference while the lock is held; the entry may be evicted right after.
+            // A null face is a remembered failure, so the archive is not rescanned for every character.
+            return cached->fontFace ? cairo_font_face_reference(cached->fontFace) : nullptr;
+        }
+    }
+
+    cairo_font_face_t* fontFace = createFontFace(filePath, fontUri);
+
+    {
+        QMutexLocker locker(&fontCacheMutex);
+        // The cache entry gets its own reference. QCache owns the wrapper after insert() and may delete
+        // it at any time, so the wrapper is never touched again here.
+        fontCache.insert(key, new FontDataWrapper(fontFace ? cairo_font_face_reference(fontFace) : nullptr));
+    }
+
+    return fontFace;
+}
+
+// Measures the advance width of one character with the embedded font (FreeType via cairo); -1.0 on failure.
+double getCharWidthFromFont(const QString &filePath, const QString &fontUri,
+                            QChar ch, double fontSize) {
+    cairo_font_face_t* fontFace = loadFontFace(filePath, fontUri);
+    if (!fontFace) {
+        return -1.0; // Font unavailable: the caller falls back to an estimate
+    }
+
+    // Font matrix: scale by the em size (same as libgxps)
+    cairo_matrix_t fontMatrix;
+    cairo_matrix_init_identity(&fontMatrix);
+    cairo_matrix_scale(&fontMatrix, fontSize, fontSize);
+
+    // CTM (identity matrix)
+    cairo_matrix_t ctm;
+    cairo_matrix_init_identity(&ctm);
+
+    // Font options
+    cairo_font_options_t* fontOptions = cairo_font_options_create();
+    cairo_font_options_set_hint_metrics(fontOptions, CAIRO_HINT_METRICS_OFF);
+
+    // Scaled font
+    cairo_scaled_font_t* scaledFont = cairo_scaled_font_create(fontFace,
+                                                              &fontMatrix,
+                                                              &ctm,
+                                                              fontOptions);
+    cairo_font_options_destroy(fontOptions);
+    // The scaled font holds its own reference to the face; release the one loadFontFace() gave us.
+    cairo_font_face_destroy(fontFace);
+
+    cairo_status_t scaledFontStatus = cairo_scaled_font_status(scaledFont);
+    if (scaledFontStatus != CAIRO_STATUS_SUCCESS) {
+        cairo_scaled_font_destroy(scaledFont);
+        return -1.0;
+    }
+
+    // Convert the Unicode character to UTF-8 (following glyphs_lookup_index in libgxps)
+    QString utf8Str = QString(ch);
+    QByteArray utf8Bytes = utf8Str.toUtf8();
+    const char* utf8 = utf8Bytes.constData();
+
+    if (utf8Bytes.isEmpty() || *utf8 == '\0') {
+        cairo_scaled_font_destroy(scaledFont);
+        return -1.0;
+    }
+
+    // Look up the glyph with cairo_scaled_font_text_to_glyphs (following libgxps)
+    cairo_glyph_t* glyphs = nullptr;
+    int numGlyphs = 0;
+
+    cairo_status_t status = cairo_scaled_font_text_to_glyphs(scaledFont,
+                                                             0.0, 0.0,
+                                                             utf8,
+                                                             utf8Bytes.length(),
+                                                             &glyphs,
+                                                             &numGlyphs,
+                                                             nullptr, // clusters
+                                                             nullptr, // num_clusters
+                                                             nullptr); // cluster_flags
+
+    double width = -1.0;
+    if (status == CAIRO_STATUS_SUCCESS && numGlyphs > 0 && glyphs) {
+        // Glyph extents (used the same way libgxps uses cairo_scaled_font_glyph_extents)
+        cairo_text_extents_t extents;
+        cairo_scaled_font_glyph_extents(scaledFont, glyphs, numGlyphs, &extents);
+        width = extents.x_advance; // x_advance is the character width (as in libgxps)
+    }
+    cairo_glyph_free(glyphs); // accepts nullptr; frees the array even when no width was taken
+
+    cairo_scaled_font_destroy(scaledFont);
+
+    return width;
+}
+
+} // anonymous namespace
+
 namespace deepin_reader {
 
 QList XpsTextExtractor::extractWords(const QString &filePath, int pageIndex)
@@ -58,25 +421,27 @@ QList XpsTextExtractor::extractWords(const QString &filePath, int pageInde
             // 计算每个字符的边界框
             qreal currentX = glyph.position.x();
             qreal baseY = glyph.position.y();
-            // 改进字符高度计算:考虑字体的ascent和descent
-            // 通常ascent约占字体大小的70-80%,descent约占20-30%
-            // 使用更合理的估算:ascent = fontSize * 0.75, descent = fontSize * 0.25
             qreal charAscent = glyph.fontSize * 0.75;
             qreal charDescent = glyph.fontSize * 0.25;
             qreal charHeight = charAscent + charDescent;
 
             for (int i = 0; i < glyph.text.length(); ++i) {
                 QChar ch = glyph.text.at(i);
-                // 优先使用Indices中的精确宽度,否则使用估算
                 double charWidth;
-                if (i < glyph.charWidths.size() && glyph.charWidths[i] > 0) {
+                if (i < glyph.charWidths.size() && glyph.charWidths[i] >= 0) {
                     charWidth = glyph.charWidths[i];
                 } else {
-                    charWidth = estimateCharWidth(ch, glyph.fontSize);
+                    // No advance width in Indices: try the real width from the embedded font
+                    double actualWidth = getCharWidthFromFont(filePath, glyph.fontUri, ch, glyph.fontSize);
+                    if (actualWidth >= 0) {
+                        charWidth = actualWidth;
+                    } else {
+                        // Font lookup failed: fall back to the estimate
+                        charWidth = estimateCharWidth(ch, glyph.fontSize);
+                    }
                 }
 
                 // 创建单个字符的边界框
-                // OriginY是基线位置,所以字符顶部 = baseY - charAscent
                 QRectF charBaseRect(currentX, baseY - charAscent, charWidth, charHeight);
 
                 // 应用变换矩阵
@@ -235,7 +600,7 @@ QList XpsTextExtractor::parseFixedPage(const QByteArr
                 QString transformStr = xml.attributes().value(QLatin1String("RenderTransform")).toString();
                 if (!transformStr.isEmpty()) {
                     QTransform transform = parseTransformMatrix(transformStr);
-                    transformStack.last() = transformStack.last() * transform;
+                    transformStack.last() = transform * transformStack.last();
                 }
             } else if (elementName == QLatin1String("Canvas") || elementName == QLatin1String("Path")) {
                 // 处理嵌套变换
@@ -243,34 +608,116 @@ QList XpsTextExtractor::parseFixedPage(const QByteArr
                 if (!transformStr.isEmpty()) {
                     // 属性形式的RenderTransform
                     QTransform transform = parseTransformMatrix(transformStr);
-                    transformStack.append(transformStack.last() * transform);
+                    transformStack.append(transform * transformStack.last());
                 } else {
                     // 没有属性形式的RenderTransform,可能需要查找子元素形式
                     transformStack.append(transformStack.last());
                 }
             } else if (elementName == QLatin1String("Canvas.RenderTransform") ||
                        elementName == QLatin1String("Path.RenderTransform")) {
-                // 进入RenderTransform子元素
+                // Entering a RenderTransform child element (of Canvas or Path)
                 renderTransformDepth++;
             } else if (elementName == QLatin1String("MatrixTransform")) {
-                // 检查是否在Canvas.RenderTransform或Path.RenderTransform内部
-                // 如果是,解析Matrix属性并应用到当前transformStack
+                // Only applies when inside a RenderTransform child element
                 if (renderTransformDepth > 0 && !transformStack.isEmpty()) {
                     QString matrixStr = xml.attributes().value(QLatin1String("Matrix")).toString();
                     if (!matrixStr.isEmpty()) {
                         QTransform transform = parseTransformMatrix(matrixStr);
-                        transformStack.last() = transformStack.last() * transform;
+                        transformStack.last() = transform * transformStack.last();
                     }
                 }
             } else if (elementName == QLatin1String("Glyphs")) {
+                // Parse a Glyphs element.
+                // Following libgxps, Glyphs accepts RenderTransform in two forms:
+                // 1. as an attribute: RenderTransform="..."
+                // 2. as a child element: Glyphs.RenderTransform containing a MatrixTransform
                 QTransform parentTransform = transformStack.isEmpty() ? QTransform() : transformStack.last();
-                GlyphInfo glyph = parseGlyphs(xml, parentTransform);
-                if (!glyph.text.isEmpty()) {
-                    TextRun run;
-                    run.text = glyph.text;
-                    run.boundingBox = glyph.boundingBox;
-                    run.glyphs.append(glyph);
-                    textRuns.append(run);
+
+                // Read the Glyphs attributes first
+                QXmlStreamAttributes attrs = xml.attributes();
+                QString unicodeString = attrs.value(QLatin1String("UnicodeString")).toString();
+                if (!unicodeString.isEmpty() && unicodeString.startsWith(QLatin1String("{}"))) {
+                    unicodeString = unicodeString.mid(2);
+                }
+
+                // Attribute form of RenderTransform
+                QString transformStr = attrs.value(QLatin1String("RenderTransform")).toString();
+                QTransform glyphTransform = parentTransform;
+                if (!transformStr.isEmpty()) {
+                    QTransform attrTransform = parseTransformMatrix(transformStr);
+                    glyphTransform = attrTransform * glyphTransform;
+                }
+
+                // Remaining Glyphs attributes (needed to build the GlyphInfo)
+                bool okX = false, okY = false;
+                double originX = attrs.value(QLatin1String("OriginX")).toDouble(&okX);
+                double originY = attrs.value(QLatin1String("OriginY")).toDouble(&okY);
+                bool okSize = false;
+                double fontSize = attrs.value(QLatin1String("FontRenderingEmSize")).toDouble(&okSize);
+                if (!okSize || fontSize <= 0) fontSize = 12.0;
+                QString fontUri = attrs.value(QLatin1String("FontUri")).toString();
+                QString indicesStr = attrs.value(QLatin1String("Indices")).toString();
+
+                if (okX && okY && !unicodeString.isEmpty()) {
+                    // Walk the children of Glyphs looking for a Glyphs.RenderTransform child element
+                    QTransform glyphLocalTransform; // accumulates the transforms found in Glyphs.RenderTransform
+                    bool inGlyphsRenderTransform = false;
+                    int glyphDepth = 1; // nesting depth of Glyphs elements
+
+                    while (!xml.atEnd() && !xml.hasError()) {
+                        QXmlStreamReader::TokenType nextToken = xml.readNext();
+                        if (nextToken == QXmlStreamReader::StartElement) {
+                            QString childName = xml.name().toString();
+                            if (childName == QLatin1String("Glyphs")) {
+                                glyphDepth++; // nested Glyphs
+                            } else if (childName == QLatin1String("Glyphs.RenderTransform")) {
+                                inGlyphsRenderTransform = true;
+                                glyphLocalTransform = QTransform(); // reset
+                            } else if (childName == QLatin1String("MatrixTransform") && inGlyphsRenderTransform) {
+                                // A MatrixTransform inside Glyphs.RenderTransform.
+                                // Note: QTransform maps row vectors, so A * B applies A first and then B.
+                                // Pre-multiplying applies this MatrixTransform before those accumulated so far.
+                                QString matrixStr = xml.attributes().value(QLatin1String("Matrix")).toString();
+                                if (!matrixStr.isEmpty()) {
+                                    QTransform matrixTransform = parseTransformMatrix(matrixStr);
+                                    glyphLocalTransform = matrixTransform * glyphLocalTransform;
+                                }
+                            }
+                        } else if (nextToken == QXmlStreamReader::EndElement) {
+                            QString childName = xml.name().toString();
+                            if (childName == QLatin1String("Glyphs")) {
+                                glyphDepth--;
+                                if (glyphDepth == 0) {
+                                    // End of the current Glyphs element
+                                    break;
+                                }
+                            } else if (childName == QLatin1String("Glyphs.RenderTransform")) {
+                                // End of Glyphs.RenderTransform: apply the accumulated transform
+                                glyphTransform = glyphLocalTransform * glyphTransform;
+                                inGlyphsRenderTransform = false;
+                            }
+                        }
+                    }
+
+                    // Build the GlyphInfo
+                    GlyphInfo glyph;
+                    glyph.text = unicodeString;
+                    glyph.position = QPointF(originX, originY);
+                    glyph.fontSize = fontSize;
+                    glyph.fontUri = fontUri;
+                    glyph.transform = glyphTransform;
+                    if (!indicesStr.isEmpty()) {
+                        glyph.charWidths = parseIndices(indicesStr, unicodeString.length(), fontSize);
+                    }
+                    glyph.boundingBox = calculateGlyphBoundingBox(glyph);
+
+                    if (!glyph.text.isEmpty()) {
+                        TextRun run;
+                        run.text = glyph.text;
+                        run.boundingBox = glyph.boundingBox;
+                        run.glyphs.append(glyph);
+                        textRuns.append(run);
+                    }
                 }
             }
         } else if (token == QXmlStreamReader::EndElement) {
@@ -305,11 +752,16 @@ XpsTextExtractor::GlyphInfo XpsTextExtractor::parseGlyphs(QXmlStreamReader &xml,
     QXmlStreamAttributes attrs = xml.attributes();
 
     // 读取UnicodeString(文本内容)
+    // Per libgxps (gxps-page.c:1012-1015), UnicodeString may start with the "{}" escape sequence, which is skipped
     QString unicodeString = attrs.value(QLatin1String("UnicodeString")).toString();
     if (unicodeString.isEmpty()) {
         // 跳过空文本的Glyphs
         return glyph;
     }
+    // Skip a leading "{}" escape sequence (following the libgxps implementation)
+    if (unicodeString.startsWith(QLatin1String("{}"))) {
+        unicodeString = unicodeString.mid(2);
+    }
     glyph.text = unicodeString;
 
     // 读取位置
@@ -365,7 +817,16 @@ QRectF XpsTextExtractor::calculateGlyphBoundingBox(const GlyphInfo &glyph)
     // 优先使用Indices中的精确宽度
     if (!glyph.charWidths.isEmpty() && glyph.charWidths.size() == glyph.text.length()) {
         for (int i = 0; i < glyph.charWidths.size(); ++i) {
-            totalWidth += glyph.charWidths[i];
+            if (glyph.charWidths[i] >= 0) {
+                // Valid advance width
+                totalWidth += glyph.charWidths[i];
+            } else {
+                // -1.0 marks an entry without an advance width (glyph index only)
+                if (i < glyph.text.length()) {
+                    // No filePath is available here, so the font cannot be consulted: use the estimate
+                    totalWidth += estimateCharWidth(glyph.text.at(i), glyph.fontSize);
+                }
+            }
         }
     } else {
         // 如果没有Indices,使用估算
@@ -374,14 +835,11 @@ QRectF XpsTextExtractor::calculateGlyphBoundingBox(const GlyphInfo &glyph)
         }
     }
 
-    // 改进文本高度计算:考虑字体的ascent和descent
-    // 通常ascent约占字体大小的70-80%,descent约占20-30%
     double charAscent = glyph.fontSize * 0.75;
     double charDescent = glyph.fontSize * 0.25;
     double height = charAscent + charDescent;
 
-    // 创建基础边界框(在文本位置)
-    // 注意:XPS坐标系中,OriginY是文本基线位置,所以字符顶部 = OriginY - charAscent
+    // Base bounding box at the text position (OriginY is the text baseline)
     QRectF baseRect(glyph.position.x(), glyph.position.y() - charAscent, totalWidth, height);
 
     // 应用变换矩阵
@@ -401,6 +859,11 @@ QTransform XpsTextExtractor::parseTransformMatrix(const QString &transformStr)
     }
 
     // XPS变换矩阵格式: "m11,m12,m21,m22,dx,dy"
+    // For reference, gxps_matrix_parse in libgxps (gxps-matrix.c):
+    // - splits with g_strsplit(data, ",", 6) into at most 6 parts
+    // - requires g_strv_length(items) == 6, i.e. exactly 6 values
+    // - parses each value with g_ascii_strtod (which skips leading whitespace)
+    // - returns FALSE when parsing fails or a value is invalid
 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
     QStringList parts = transformStr.split(QLatin1Char(','), QString::SkipEmptyParts);
 #else
@@ -410,18 +873,19 @@ QTransform XpsTextExtractor::parseTransformMatrix(const QString &transformStr)
         return QTransform();
     }
 
+    // Parse the six values
     bool ok = false;
-    double m11 = parts[0].toDouble(&ok);
+    double m11 = parts[0].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
-    double m12 = parts[1].toDouble(&ok);
+    double m12 = parts[1].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
-    double m21 = parts[2].toDouble(&ok);
+    double m21 = parts[2].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
-    double m22 = parts[3].toDouble(&ok);
+    double m22 = parts[3].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
-    double dx = parts[4].toDouble(&ok);
+    double dx = parts[4].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
-    double dy = parts[5].toDouble(&ok);
+    double dy = parts[5].trimmed().toDouble(&ok);
     if (!ok) return QTransform();
 
     return QTransform(m11, m12, m21, m22, dx, dy);
@@ -433,7 +897,7 @@ double XpsTextExtractor::estimateCharWidth(QChar ch, double fontSize)
         return 12.0; // 默认宽度
     }
 
-    // 简单的字符宽度估算
+    // Character width estimate (used when the real width cannot be read from the font file)
     // 中文字符、日文、韩文等全角字符
     if (ch.unicode() >= 0x4E00 && ch.unicode() <= 0x9FFF) { // CJK统一汉字
         return fontSize;
@@ -478,9 +942,6 @@ QList XpsTextExtractor::parseIndices(const QString &indicesStr, int text
         return widths;
     }
 
-    // XPS Indices格式:每个字符对应一个值,用分号分隔
-    // 格式可能是:",100;,100;,98.864;" 或 "100;100;98.864;"
-    // 值表示字符的advance width(相对于字体大小的比例,单位是1/100 em)
 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
     QStringList parts = indicesStr.split(QLatin1Char(';'), QString::SkipEmptyParts);
 #else
@@ -489,25 +950,91 @@ QList XpsTextExtractor::parseIndices(const QString &indicesStr, int text
 
     for (int i = 0; i < parts.size() && i < textLength; ++i) {
         QString part = parts[i].trimmed();
-        // 移除可能的逗号前缀
-        if (part.startsWith(QLatin1Char(','))) {
-            part = part.mid(1);
-        }
 
+        // Each entry is "[GlyphIndex][,AdvanceWidth[,uOffset[,vOffset]]]". The fields are positional, so
+        // empty fields must be kept: ",100" has no glyph index but does carry an advance width of 100.
+        const QStringList fields = part.split(QLatin1Char(','));
+        const QString advanceWidthStr = fields.size() >= 2 ? fields.at(1).trimmed() : QString();
+
         bool ok = false;
-        double value = part.toDouble(&ok);
-        if (ok && value > 0) {
-            // Indices中的值是相对于字体大小的比例(单位是1/100 em)
-            // 所以实际宽度 = fontSize * value / 100.0
+        const double value = advanceWidthStr.toDouble(&ok);
+        if (ok && value >= 0) {
+            // AdvanceWidth is in hundredths of an em, so the width is fontSize * value / 100.0
             double width = fontSize * value / 100.0;
             widths.append(width);
         } else {
-            // 如果解析失败,跳过(让调用者使用估算值)
+            // No usable advance width (missing, empty or malformed): -1.0 tells callers to measure or estimate
+            widths.append(-1.0);
         }
     }
-    
+
     return widths;
 }
 
+QByteArray XpsTextExtractor::readFontFromZip(const QString &filePath, const QString &fontUri)
+{
+    if (filePath.isEmpty() || fontUri.isEmpty()) {
+        return QByteArray();
+    }
+
+    QString cleanFontUri = fontUri;
+    while (cleanFontUri.startsWith(QLatin1String("/")) || cleanFontUri.startsWith(QLatin1String("../"))) {
+        if (cleanFontUri.startsWith(QLatin1String("/"))) {
+            cleanFontUri = cleanFontUri.mid(1);
+        } else if (cleanFontUri.startsWith(QLatin1String("../"))) {
+            cleanFontUri = cleanFontUri.mid(3);
+        }
+    }
+
+    struct archive *a = archive_read_new();
+    struct archive_entry *entry = nullptr;
+
+    if (!a) {
+        return QByteArray();
+    }
+
+    archive_read_support_format_zip(a);
+    archive_read_support_format_all(a);
+
+    QByteArray utf8Path = QFile::encodeName(filePath);
+    int r = archive_read_open_filename(a, utf8Path.constData(), 10240);
+    if (r != ARCHIVE_OK) {
+        archive_read_free(a);
+        return QByteArray();
+    }
+
+    QByteArray result;
+    bool found = false;
+    const QString fontFileName = cleanFontUri.section(QLatin1Char('/'), -1);
+    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
+        QString entryPath = QString::fromUtf8(archive_entry_pathname(entry));
+        // Match the full part name, a path suffix, or the exact file name; a substring test can pick the wrong font.
+        if (entryPath == fontUri || entryPath == cleanFontUri ||
+            entryPath.endsWith(QLatin1String("/") + cleanFontUri) ||
+            (!fontFileName.isEmpty() && entryPath.section(QLatin1Char('/'), -1) == fontFileName)) {
+            la_int64_t size = archive_entry_size(entry);
+            if (size > 0 && size < 50 * 1024 * 1024) {
+                result.resize(static_cast<int>(size));
+                la_ssize_t readSize = archive_read_data(a, result.data(), size);
+                if (readSize == size) {
+                    found = true;
+                } else {
+                    result.clear();
+                }
+            }
+            break;
+        }
+        archive_read_data_skip(a);
+    }
+
+    archive_read_free(a);
+
+    if (!found) {
+        return QByteArray();
+    }
+
+    return result;
+}
+
 } // namespace deepin_reader
diff --git a/reader/document/XpsTextExtractor.h b/reader/document/XpsTextExtractor.h
index c5e6dece..a00c9566 100644
--- a/reader/document/XpsTextExtractor.h
+++ b/reader/document/XpsTextExtractor.h
@@ -126,6 +126,14 @@ class XpsTextExtractor
      * @return FixedPage文件在ZIP中的路径,失败返回空字符串
      */
     static QString findFixedPagePath(const QString &filePath, int pageIndex);
+
+    /**
+     * @brief Reads a font file from the XPS (ZIP) package
+     * @param filePath Path of the XPS file
+     * @param fontUri Font URI (leading "/" and "../" segments are ignored when matching entries)
+     * @return The font file bytes, or an empty QByteArray if the font is missing or cannot be read
+     */
+    static QByteArray readFontFromZip(const QString &filePath, const QString &fontUri);
 };
 
 } // namespace deepin_reader