From ea409d5c5c8dc346ced7874389b8b056373e93bc Mon Sep 17 00:00:00 2001
From: ZhangTingan
Date: Tue, 11 Nov 2025 14:26:07 +0800
Subject: [PATCH 1/2] chore: [gitignore] add .cursor/

Log: as title
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 05726a80..7f6cd374 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,4 @@ build
 .promptx/
 .spec-workflow/
 .cursorindexingignore
+.cursor/

From feba3dbf2a3da29a6eca1227de99604a9f210cda Mon Sep 17 00:00:00 2001
From: ZhangTingan
Date: Thu, 13 Nov 2025 14:27:25 +0800
Subject: [PATCH 2/2] feat: Add XPS text extraction and character-level selection

Implement XPS text extraction by parsing FixedPage XML structure.
Support both attribute and child element forms of RenderTransform
to handle coordinate transformations correctly. Parse Indices attribute
for precise character widths and support character-level selection.

log: feature
task: https://pms.uniontech.com/task-view-383459.html
---
 reader/browser/BrowserWord.cpp         |   5 +-
 reader/document/XpsDocumentAdapter.cpp |  63 +++-
 reader/document/XpsDocumentAdapter.h   |   2 +
 reader/document/XpsTextExtractor.cpp   | 549 +++++++++++++++++++++++++
 reader/document/XpsTextExtractor.h     | 149 +++++++
 5 files changed, 761 insertions(+), 7 deletions(-)
 create mode 100644 reader/document/XpsTextExtractor.cpp
 create mode 100644 reader/document/XpsTextExtractor.h

diff --git a/reader/browser/BrowserWord.cpp b/reader/browser/BrowserWord.cpp
index a26cb11c..ab92ebfc 100644
--- a/reader/browser/BrowserWord.cpp
+++ b/reader/browser/BrowserWord.cpp
@@ -44,10 +44,7 @@ void BrowserWord::setSelectable(bool enable)
 
 QRectF BrowserWord::boundingRect() const
 {
-    // qCDebug(appLog) << "BrowserWord::boundingRect() - Calculating bounding rectangle";
-    QRectF rect = QRectF(m_word.boundingBox.x() * m_scaleFactor - 1, m_word.boundingBox.y() * m_scaleFactor - 1, m_word.boundingBox.width() * m_scaleFactor + 2, m_word.boundingBox.height() * m_scaleFactor + 2);
-    // qCDebug(appLog) << "BrowserWord::boundingRect() - Bounding rectangle:" << rect;
-    return rect;
+    return QRectF(m_word.boundingBox.x() * m_scaleFactor - 1, m_word.boundingBox.y() * m_scaleFactor - 1, m_word.boundingBox.width() * m_scaleFactor + 2, m_word.boundingBox.height() * m_scaleFactor + 2);
 }
 
 QRectF BrowserWord::boundingBox() const
diff --git a/reader/document/XpsDocumentAdapter.cpp b/reader/document/XpsDocumentAdapter.cpp
--- a/reader/document/XpsDocumentAdapter.cpp
+++ b/reader/document/XpsDocumentAdapter.cpp
@@ -5,5 +5,6 @@
 
 #include "XpsDocumentAdapter.h"
 
+#include "XpsTextExtractor.h"
 #include "ddlog.h"
 
@@ -795,7 +796,50 @@ Link XpsPageAdapter::getLinkAtPoint(const QPointF &point)
 
 QString XpsPageAdapter::text(const QRectF &rect) const
 {
-    Q_UNUSED(rect)
-    return QString();
+    if (!m_document || rect.isEmpty()) {
+        return QString();
+    }
+
+    // Call the extractor directly: words() is non-const and cannot be used here.
+    const QString filePath = m_document->filePath();
+    if (filePath.isEmpty()) {
+        qCWarning(appLog) << "XpsPageAdapter::text() - Empty file path";
+        return QString();
+    }
+
+    // extractWords() yields one Word per character, already in reading order,
+    // so filtering by rect keeps that order and no second sort is needed.
+    const QList<Word> pageWords = XpsTextExtractor::extractWords(filePath, m_pageIndex);
+
+    const qreal lineTolerance = 5.0; // Y distance that still counts as the same line
+    QString result;
+    QRectF previousBox;
+    bool hasPrevious = false;
+    for (const Word &word : pageWords) {
+        const QRectF box = word.boundingBox;
+        if (!rect.intersects(box)) {
+            continue;
+        }
+
+        // Characters are concatenated as they are. A space is added only where
+        // the layout implies a break: a new line (Y jump or X moving backwards),
+        // or a horizontal gap wider than a quarter of the character height.
+        if (hasPrevious) {
+            const bool newLine = qAbs(box.y() - previousBox.y()) > lineTolerance || box.left() < previousBox.left();
+            const bool wideGap = box.left() - previousBox.right() > 0.25 * box.height();
+            if (newLine || wideGap) {
+                result += QLatin1Char(' ');
+            }
+        }
+
+        result += word.text;
+        previousBox = box;
+        hasPrevious = true;
+    }
+
+    result = result.simplified();
+    qCDebug(appLog) << "XpsPageAdapter::text() - Extracted text for rect" << rect << ":" << result;
+
+    return result;
 }
 
@@ -814,7 +858,20 @@
 
 QList<Word> XpsPageAdapter::words()
 {
-    return {};
+    if (!m_document) {
+        qCWarning(appLog) << "XpsPageAdapter::words() - Invalid document";
+        return {};
+    }
+
+    // Path of the XPS package on disk
+    QString filePath = m_document->filePath();
+    if (filePath.isEmpty()) {
+        qCWarning(appLog) << "XpsPageAdapter::words() - Empty file path";
+        return {};
+    }
+
+    // Delegate to the text extractor
+    return XpsTextExtractor::extractWords(filePath, m_pageIndex);
 }
 
 } // namespace deepin_reader
diff --git a/reader/document/XpsDocumentAdapter.h b/reader/document/XpsDocumentAdapter.h
index d55ecec9..afc7eb16 100644
--- a/reader/document/XpsDocumentAdapter.h
+++ b/reader/document/XpsDocumentAdapter.h
@@ -40,6 +40,8 @@ class XpsDocumentAdapter : public Document
     QImage renderPage(int pageIndex, int width, int height, const QRect &slice) const;
     QSizeF pageSize(int pageIndex) const;
 
+    QString filePath() const { return m_filePath; }
+
 private:
     class Handle;
 
diff --git a/reader/document/XpsTextExtractor.cpp b/reader/document/XpsTextExtractor.cpp
new file mode 100644
--- /dev/null
+++ b/reader/document/XpsTextExtractor.cpp
@@ -0,0 +1,549 @@
+// Copyright (C) 2019 ~ 2025 Uniontech Software Technology Co.,Ltd.
+// SPDX-FileCopyrightText: 2025 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "XpsTextExtractor.h"
+
+#include "ddlog.h"
+
+#include <QFile>
+#include <QStringList>
+#include <QXmlStreamReader>
+
+#include <algorithm>
+#include <memory>
+
+#ifdef signals
+#pragma push_macro("signals")
+#undef signals
+#endif
+#ifdef slots
+#pragma push_macro("slots")
+#undef slots
+#endif
+
+extern "C" {
+#include <archive.h>
+#include <archive_entry.h>
+}
+
+#ifdef slots
+#pragma pop_macro("slots")
+#endif
+#ifdef signals
+#pragma pop_macro("signals")
+#endif
+
+namespace deepin_reader {
+
+namespace {
+
+// Words whose top edges lie within this distance (page units) share a line.
+constexpr qreal kLineTolerance = 5.0;
+
+// Upper bound for a single FixedPage part; larger entries are rejected.
+constexpr qint64 kMaxFixedPageSize = 100LL * 1024 * 1024;
+
+// Frees a libarchive read handle on every exit path.
+struct ArchiveReadDeleter {
+    void operator()(struct archive *handle) const { archive_read_free(handle); }
+};
+using ArchiveReadPtr = std::unique_ptr<struct archive, ArchiveReadDeleter>;
+
+/**
+ * @brief Sort words top-to-bottom, then left-to-right within each line.
+ *
+ * A comparator that treats "Y within tolerance" as equal is not a strict
+ * weak ordering, which std::sort requires. The words are therefore sorted
+ * by Y first, cut into lines, and each line is then sorted by X.
+ */
+void sortInReadingOrder(QList<Word> &words)
+{
+    const auto topEdgeLess = [](const Word &a, const Word &b) {
+        return a.boundingBox.y() < b.boundingBox.y();
+    };
+    const auto leftEdgeLess = [](const Word &a, const Word &b) {
+        return a.boundingBox.x() < b.boundingBox.x();
+    };
+
+    std::stable_sort(words.begin(), words.end(), topEdgeLess);
+
+    auto lineBegin = words.begin();
+    while (lineBegin != words.end()) {
+        const qreal lineTop = lineBegin->boundingBox.y();
+        auto lineEnd = lineBegin + 1;
+        while (lineEnd != words.end() && lineEnd->boundingBox.y() - lineTop <= kLineTolerance) {
+            ++lineEnd;
+        }
+        std::stable_sort(lineBegin, lineEnd, leftEdgeLess);
+        lineBegin = lineEnd;
+    }
+}
+
+/**
+ * @brief Compare a ZIP entry name with a package part name.
+ *
+ * Tolerates a leading "/" and differing letter case in the entry name.
+ */
+bool isSamePart(const QString &entryName, const QString &partName)
+{
+    QString name = entryName;
+    while (name.startsWith(QLatin1Char('/'))) {
+        name.remove(0, 1);
+    }
+    return name.compare(partName, Qt::CaseInsensitive) == 0;
+}
+
+/**
+ * @brief Read the advance width from one Indices glyph entry.
+ *
+ * An entry has the form "[GlyphIndex][,[Advance][,[uOffset][,[vOffset]]]]";
+ * the advance is expressed in hundredths of an em.
+ *
+ * @return Width in page units, or 0.0 when the entry carries no usable advance.
+ */
+double advanceFromIndexEntry(const QString &entry, double fontSize)
+{
+    const QStringList fields = entry.split(QLatin1Char(','));
+    if (fields.size() < 2) {
+        return 0.0;
+    }
+
+    bool ok = false;
+    const double hundredths = fields.at(1).trimmed().toDouble(&ok);
+    return (ok && hundredths > 0) ? fontSize * hundredths / 100.0 : 0.0;
+}
+
+} // namespace
+
+QList<Word> XpsTextExtractor::extractWords(const QString &filePath, int pageIndex)
+{
+    const QList<TextRun> textRuns = extractTextRuns(filePath, pageIndex);
+
+    // Emit one Word per character so the view can select single characters.
+    QList<Word> allWords;
+    for (const TextRun &run : textRuns) {
+        if (!run.glyphs.isEmpty()) {
+            for (const GlyphInfo &glyph : run.glyphs) {
+                appendGlyphCharacters(glyph, allWords);
+            }
+        } else if (!run.text.isEmpty() && !run.boundingBox.isEmpty()) {
+            // No glyph data: share the run's box evenly between its characters.
+            const qreal avgCharWidth = run.boundingBox.width() / run.text.length();
+            qreal currentX = run.boundingBox.x();
+            for (const QChar ch : run.text) {
+                Word word;
+                word.text = QString(ch);
+                word.boundingBox = QRectF(currentX, run.boundingBox.y(), avgCharWidth, run.boundingBox.height());
+                allWords.append(word);
+                currentX += avgCharWidth;
+            }
+        }
+    }
+
+    sortInReadingOrder(allWords);
+    return allWords;
+}
+
+void XpsTextExtractor::appendGlyphCharacters(const GlyphInfo &glyph, QList<Word> &words)
+{
+    if (glyph.text.isEmpty() || glyph.boundingBox.isEmpty()) {
+        return;
+    }
+
+    // OriginY is the baseline. Real font metrics are not loaded, so the box
+    // assumes an ascent of 0.75 em and a descent of 0.25 em.
+    const qreal charAscent = glyph.fontSize * 0.75;
+    const qreal charHeight = charAscent + glyph.fontSize * 0.25;
+    qreal currentX = glyph.position.x();
+
+    int i = 0;
+    while (i < glyph.text.length()) {
+        // Keep a surrogate pair in one Word: half a pair is not valid text.
+        const bool isPair = glyph.text.at(i).isHighSurrogate()
+                            && i + 1 < glyph.text.length()
+                            && glyph.text.at(i + 1).isLowSurrogate();
+        const int unitCount = isPair ? 2 : 1;
+
+        qreal charWidth = 0.0;
+        for (int k = 0; k < unitCount; ++k) {
+            charWidth += glyphCharWidth(glyph, i + k);
+        }
+
+        // mapRect() yields the bounding rectangle of the transformed box.
+        const QRectF baseRect(currentX, glyph.position.y() - charAscent, charWidth, charHeight);
+        Word word;
+        word.text = glyph.text.mid(i, unitCount);
+        word.boundingBox = glyph.transform.mapRect(baseRect);
+        words.append(word);
+
+        currentX += charWidth;
+        i += unitCount;
+    }
+}
+
+double XpsTextExtractor::glyphCharWidth(const GlyphInfo &glyph, int index)
+{
+    if (index < glyph.charWidths.size() && glyph.charWidths.at(index) > 0) {
+        return glyph.charWidths.at(index);
+    }
+    return estimateCharWidth(glyph.text.at(index), glyph.fontSize);
+}
+
+QList<XpsTextExtractor::TextRun> XpsTextExtractor::extractTextRuns(const QString &filePath, int pageIndex)
+{
+    const QByteArray xmlData = readFixedPageFromZip(filePath, pageIndex);
+    if (xmlData.isEmpty()) {
+        return {};
+    }
+
+    return parseFixedPage(xmlData, pageIndex);
+}
+
+QByteArray XpsTextExtractor::readFixedPageFromZip(const QString &filePath, int pageIndex)
+{
+    const QString fixedPagePath = findFixedPagePath(filePath, pageIndex);
+    if (fixedPagePath.isEmpty()) {
+        return QByteArray();
+    }
+
+    ArchiveReadPtr reader(archive_read_new());
+    if (!reader) {
+        return QByteArray();
+    }
+
+    // An XPS package is a ZIP container, so only that format is needed.
+    archive_read_support_format_zip(reader.get());
+
+    const QByteArray localPath = QFile::encodeName(filePath);
+    if (archive_read_open_filename(reader.get(), localPath.constData(), 10240) != ARCHIVE_OK) {
+        return QByteArray();
+    }
+
+    struct archive_entry *entry = nullptr;
+    while (archive_read_next_header(reader.get(), &entry) == ARCHIVE_OK) {
+        if (!isSamePart(QString::fromUtf8(archive_entry_pathname(entry)), fixedPagePath)) {
+            continue;
+        }
+
+        // The declared size may be absent (streamed ZIP entries), so it is only
+        // used as an early size check; the data is read until end of entry.
+        if (archive_entry_size_is_set(entry) && archive_entry_size(entry) > kMaxFixedPageSize) {
+            return QByteArray();
+        }
+
+        QByteArray pageData;
+        char buffer[16 * 1024];
+        for (;;) {
+            const la_ssize_t readSize = archive_read_data(reader.get(), buffer, sizeof(buffer));
+            if (readSize < 0) {
+                return QByteArray(); // read error: do not return a truncated page
+            }
+            if (readSize == 0) {
+                return pageData;
+            }
+            if (pageData.size() + readSize > kMaxFixedPageSize) {
+                return QByteArray();
+            }
+            pageData.append(buffer, static_cast<int>(readSize));
+        }
+    }
+
+    return QByteArray();
+}
+
+QString XpsTextExtractor::findFixedPagePath(const QString &filePath, int pageIndex)
+{
+    // The conventional layout is assumed; resolving the real part name from
+    // the package relationships would need filePath and is not implemented.
+    Q_UNUSED(filePath)
+
+    if (pageIndex < 0) {
+        return QString();
+    }
+
+    // Conventional part name: Documents/1/Pages/{n}.fpage, numbered from 1.
+    return QStringLiteral("Documents/1/Pages/%1.fpage").arg(pageIndex + 1);
+}
+
+QList<XpsTextExtractor::TextRun> XpsTextExtractor::parseFixedPage(const QByteArray &xmlData, int pageIndex)
+{
+    Q_UNUSED(pageIndex);
+    QList<TextRun> textRuns;
+    QXmlStreamReader xml(xmlData);
+
+    // Transform in effect for the element being read; the bottom entry is
+    // the page itself. Qt maps points as p * M, so a child's matrix has to be
+    // applied first: combined = local * parent.
+    QList<QTransform> transformStack;
+    transformStack.append(QTransform());
+
+    // > 0 while inside a Canvas.RenderTransform / Path.RenderTransform element
+    int renderTransformDepth = 0;
+
+    while (!xml.atEnd() && !xml.hasError()) {
+        const QXmlStreamReader::TokenType token = xml.readNext();
+
+        if (token == QXmlStreamReader::StartElement) {
+            const QString elementName = xml.name().toString();
+
+            if (elementName == QLatin1String("FixedPage")) {
+                const QString transformStr = xml.attributes().value(QLatin1String("RenderTransform")).toString();
+                if (!transformStr.isEmpty()) {
+                    transformStack.last() = parseTransformMatrix(transformStr) * transformStack.last();
+                }
+            } else if (elementName == QLatin1String("Canvas") || elementName == QLatin1String("Path")) {
+                // Every Canvas/Path pushes one entry so the matching end tag can
+                // pop it, whether or not the element carries a transform.
+                const QString transformStr = xml.attributes().value(QLatin1String("RenderTransform")).toString();
+                if (!transformStr.isEmpty()) {
+                    transformStack.append(parseTransformMatrix(transformStr) * transformStack.last());
+                } else {
+                    // A child-element transform, if any, is merged in later.
+                    transformStack.append(transformStack.last());
+                }
+            } else if (elementName == QLatin1String("Canvas.RenderTransform") ||
+                       elementName == QLatin1String("Path.RenderTransform")) {
+                renderTransformDepth++;
+            } else if (elementName == QLatin1String("MatrixTransform")) {
+                // Only a MatrixTransform inside a *.RenderTransform element
+                // belongs to the enclosing Canvas/Path.
+                if (renderTransformDepth > 0 && !transformStack.isEmpty()) {
+                    const QString matrixStr = xml.attributes().value(QLatin1String("Matrix")).toString();
+                    if (!matrixStr.isEmpty()) {
+                        transformStack.last() = parseTransformMatrix(matrixStr) * transformStack.last();
+                    }
+                }
+            } else if (elementName == QLatin1String("Glyphs")) {
+                const QTransform parentTransform = transformStack.isEmpty() ? QTransform() : transformStack.last();
+                const GlyphInfo glyph = parseGlyphs(xml, parentTransform);
+                if (!glyph.text.isEmpty()) {
+                    TextRun run;
+                    run.text = glyph.text;
+                    run.boundingBox = glyph.boundingBox;
+                    run.glyphs.append(glyph);
+                    textRuns.append(run);
+                }
+            }
+        } else if (token == QXmlStreamReader::EndElement) {
+            const QString elementName = xml.name().toString();
+            if (elementName == QLatin1String("Canvas") || elementName == QLatin1String("Path")) {
+                if (transformStack.size() > 1) {
+                    transformStack.removeLast();
+                }
+            } else if (elementName == QLatin1String("Canvas.RenderTransform") ||
+                       elementName == QLatin1String("Path.RenderTransform")) {
+                if (renderTransformDepth > 0) {
+                    renderTransformDepth--;
+                }
+            }
+        }
+    }
+
+    if (xml.hasError()) {
+        return {};
+    }
+
+    return textRuns;
+}
+
+XpsTextExtractor::GlyphInfo XpsTextExtractor::parseGlyphs(QXmlStreamReader &xml, const QTransform &parentTransform)
+{
+    GlyphInfo glyph;
+    glyph.transform = parentTransform;
+    glyph.fontSize = 12.0; // fallback font size
+
+    const QXmlStreamAttributes attrs = xml.attributes();
+
+    // UnicodeString holds the text; a Glyphs element without it is skipped.
+    QString unicodeString = attrs.value(QLatin1String("UnicodeString")).toString();
+    // A string that really starts with "{" is stored with a "{}" escape prefix.
+    if (unicodeString.startsWith(QLatin1String("{}"))) {
+        unicodeString.remove(0, 2);
+    }
+    if (unicodeString.isEmpty()) {
+        return glyph;
+    }
+    glyph.text = unicodeString;
+
+    // Baseline origin of the first character
+    bool okX = false, okY = false;
+    const double originX = attrs.value(QLatin1String("OriginX")).toDouble(&okX);
+    const double originY = attrs.value(QLatin1String("OriginY")).toDouble(&okY);
+    if (!okX || !okY) {
+        return glyph;
+    }
+    glyph.position = QPointF(originX, originY);
+
+    bool okSize = false;
+    double fontSize = attrs.value(QLatin1String("FontRenderingEmSize")).toDouble(&okSize);
+    if (!okSize || fontSize <= 0) {
+        fontSize = 12.0;
+    }
+    glyph.fontSize = fontSize;
+
+    glyph.fontUri = attrs.value(QLatin1String("FontUri")).toString();
+
+    // Indices may carry an explicit advance width per character.
+    const QString indicesStr = attrs.value(QLatin1String("Indices")).toString();
+    if (!indicesStr.isEmpty()) {
+        glyph.charWidths = parseIndices(indicesStr, unicodeString.length(), fontSize);
+    }
+
+    // The element's own transform applies before the inherited one.
+    const QString transformStr = attrs.value(QLatin1String("RenderTransform")).toString();
+    if (!transformStr.isEmpty()) {
+        glyph.transform = parseTransformMatrix(transformStr) * parentTransform;
+    }
+
+    glyph.boundingBox = calculateGlyphBoundingBox(glyph);
+
+    return glyph;
+}
+
+QRectF XpsTextExtractor::calculateGlyphBoundingBox(const GlyphInfo &glyph)
+{
+    if (glyph.text.isEmpty()) {
+        return QRectF();
+    }
+
+    // Explicit advances are used where Indices provides them; every other
+    // character falls back to the estimate, as in appendGlyphCharacters().
+    double totalWidth = 0.0;
+    for (int i = 0; i < glyph.text.length(); ++i) {
+        totalWidth += glyphCharWidth(glyph, i);
+    }
+
+    // OriginY is the baseline, so the top edge sits one ascent above it.
+    const double charAscent = glyph.fontSize * 0.75;
+    const double charDescent = glyph.fontSize * 0.25;
+    const QRectF baseRect(glyph.position.x(), glyph.position.y() - charAscent,
+                          totalWidth, charAscent + charDescent);
+
+    return glyph.transform.mapRect(baseRect);
+}
+
+QTransform XpsTextExtractor::parseTransformMatrix(const QString &transformStr)
+{
+    // Expected form: "m11,m12,m21,m22,dx,dy"; anything else yields identity.
+    // Empty fields are dropped by hand to avoid the Qt-version-specific enum.
+    QStringList parts = transformStr.split(QLatin1Char(','));
+    parts.removeAll(QString());
+    if (parts.size() != 6) {
+        return QTransform();
+    }
+
+    double m[6];
+    for (int i = 0; i < 6; ++i) {
+        bool ok = false;
+        m[i] = parts.at(i).trimmed().toDouble(&ok);
+        if (!ok) {
+            return QTransform();
+        }
+    }
+
+    return QTransform(m[0], m[1], m[2], m[3], m[4], m[5]);
+}
+
+double XpsTextExtractor::estimateCharWidth(QChar ch, double fontSize)
+{
+    if (fontSize <= 0) {
+        return 12.0; // fallback width
+    }
+
+    // A surrogate pair is one character: the high half carries a full-width
+    // estimate and the low half adds nothing.
+    if (ch.isLowSurrogate()) {
+        return 0.0;
+    }
+    if (ch.isHighSurrogate()) {
+        return fontSize;
+    }
+
+    const uint code = ch.unicode();
+
+    // Full-width scripts: CJK ideographs, Hiragana, Katakana, Hangul
+    if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3040 && code <= 0x309F) ||
+        (code >= 0x30A0 && code <= 0x30FF) || (code >= 0xAC00 && code <= 0xD7AF)) {
+        return fontSize;
+    }
+
+    // Latin letters and digits
+    if ((ch >= QLatin1Char('A') && ch <= QLatin1Char('Z')) ||
+        (ch >= QLatin1Char('a') && ch <= QLatin1Char('z'))) {
+        return fontSize * 0.6;
+    }
+    if (ch >= QLatin1Char('0') && ch <= QLatin1Char('9')) {
+        return fontSize * 0.5;
+    }
+
+    // Space and no-break space
+    if (ch == QLatin1Char(' ') || ch == QChar(0x00A0)) {
+        return fontSize * 0.3;
+    }
+
+    // Punctuation
+    if (ch.isPunct()) {
+        return fontSize * 0.4;
+    }
+
+    // Everything else
+    return fontSize * 0.6;
+}
+
+QList<double> XpsTextExtractor::parseIndices(const QString &indicesStr, int textLength, double fontSize)
+{
+    QList<double> widths;
+    if (indicesStr.isEmpty() || textLength <= 0) {
+        return widths;
+    }
+
+    // Entries are separated by ';'. Empty entries are kept: each one still
+    // stands for a glyph, and dropping it would shift every later width.
+    const QStringList entries = indicesStr.split(QLatin1Char(';'));
+    const int entryCount = static_cast<int>(entries.size());
+
+    int entryIndex = 0;
+    while (entryIndex < entryCount && widths.size() < textLength) {
+        QString entry = entries.at(entryIndex).trimmed();
+
+        // Optional cluster prefix "(codeUnits[:glyphs])" on the first entry of a
+        // cluster: that many UTF-16 units are drawn by that many glyph entries.
+        int codeUnitCount = 1;
+        int glyphCount = 1;
+        const int closeParen = entry.startsWith(QLatin1Char('(')) ? static_cast<int>(entry.indexOf(QLatin1Char(')'))) : -1;
+        if (closeParen > 0) {
+            const QStringList counts = entry.mid(1, closeParen - 1).split(QLatin1Char(':'));
+            codeUnitCount = qMax(1, counts.value(0).toInt());
+            glyphCount = qMax(1, counts.value(1).toInt());
+            entry = entry.mid(closeParen + 1);
+        }
+        glyphCount = qMin(glyphCount, entryCount - entryIndex); // never run past the list
+
+        // Sum the advances of the cluster's glyphs; one missing advance makes
+        // the whole cluster unknown.
+        double clusterWidth = 0.0;
+        bool widthKnown = true;
+        for (int g = 0; g < glyphCount; ++g) {
+            if (g > 0) {
+                entry = entries.at(entryIndex + g).trimmed();
+            }
+            const double advance = advanceFromIndexEntry(entry, fontSize);
+            widthKnown = widthKnown && advance > 0;
+            clusterWidth += advance;
+        }
+        entryIndex += glyphCount;
+
+        // 0.0 marks "no explicit width" so the caller falls back to its estimate
+        // while the list stays aligned with the text.
+        const double unitWidth = widthKnown ? clusterWidth / codeUnitCount : 0.0;
+        for (int u = 0; u < codeUnitCount && widths.size() < textLength; ++u) {
+            widths.append(unitWidth);
+        }
+    }
+
+    return widths;
+}
+
+} // namespace deepin_reader
diff --git a/reader/document/XpsTextExtractor.h b/reader/document/XpsTextExtractor.h
new file mode 100644
--- /dev/null
+++ b/reader/document/XpsTextExtractor.h
@@ -0,0 +1,149 @@
+// Copyright (C) 2019 ~ 2025 Uniontech Software Technology Co.,Ltd.
+// SPDX-FileCopyrightText: 2025 UnionTech Software Technology Co., Ltd.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef XPSTEXTEXTRACTOR_H
+#define XPSTEXTEXTRACTOR_H
+
+#include "Model.h"
+
+#include <QList>
+#include <QPointF>
+#include <QRectF>
+#include <QString>
+#include <QTransform>
+
+class QXmlStreamReader;
+
+namespace deepin_reader {
+
+/**
+ * @brief XPS text extractor.
+ *
+ * Reads text and its page coordinates from the FixedPage parts of an XPS package.
+ */
+class XpsTextExtractor
+{
+public:
+    /**
+     * @brief Data parsed from one Glyphs element.
+     */
+    struct GlyphInfo {
+        QString text;              // UnicodeString of the element
+        QPointF position;          // baseline origin (OriginX, OriginY)
+        QRectF boundingBox;        // transformed box of the whole string
+        double fontSize = 12.0;    // FontRenderingEmSize; 12.0 when absent or invalid
+        QString fontUri;           // FontUri attribute (relative part name)
+        QTransform transform;      // accumulated transform applied to the boxes
+        QList<double> charWidths;  // width per UTF-16 unit from Indices; 0 or missing = unknown
+    };
+
+    /**
+     * @brief A run of text with its bounding box.
+     */
+    struct TextRun {
+        QString text;              // full text of the run
+        QRectF boundingBox;        // bounding box of the run
+        QList<GlyphInfo> glyphs;   // Glyphs elements the run was built from
+    };
+
+    /**
+     * @brief Extract the characters of one page.
+     * @param filePath path of the XPS file
+     * @param pageIndex zero-based page index
+     * @return one Word per character, ordered top-to-bottom then left-to-right
+     */
+    static QList<Word> extractWords(const QString &filePath, int pageIndex);
+
+    /**
+     * @brief Extract the text runs of one page (detailed form).
+     * @param filePath path of the XPS file
+     * @param pageIndex zero-based page index
+     * @return list of TextRun, empty on failure
+     */
+    static QList<TextRun> extractTextRuns(const QString &filePath, int pageIndex);
+
+private:
+    /**
+     * @brief Parse a FixedPage XML document.
+     * @param xmlData XML data
+     * @param pageIndex page index (currently unused)
+     * @return list of TextRun, empty if the XML is malformed
+     */
+    static QList<TextRun> parseFixedPage(const QByteArray &xmlData, int pageIndex);
+
+    /**
+     * @brief Parse one Glyphs element.
+     * @param xml reader positioned on the Glyphs start element
+     * @param parentTransform transform inherited from the enclosing elements
+     * @return the parsed GlyphInfo; its text is empty when the element has none
+     */
+    static GlyphInfo parseGlyphs(QXmlStreamReader &xml, const QTransform &parentTransform);
+
+    /**
+     * @brief Compute the bounding box of a whole Glyphs string.
+     * @param glyph parsed Glyphs element
+     * @return transformed bounding box, null if the text is empty
+     */
+    static QRectF calculateGlyphBoundingBox(const GlyphInfo &glyph);
+
+    /**
+     * @brief Split one Glyphs string into per-character Words.
+     * @param glyph parsed Glyphs element
+     * @param words list the characters are appended to
+     */
+    static void appendGlyphCharacters(const GlyphInfo &glyph, QList<Word> &words);
+
+    /**
+     * @brief Width of one UTF-16 unit of a Glyphs string.
+     * @param glyph parsed Glyphs element
+     * @param index index into glyph.text; must be within the text
+     * @return the explicit Indices width if present, otherwise an estimate
+     */
+    static double glyphCharWidth(const GlyphInfo &glyph, int index);
+
+    /**
+     * @brief Parse a matrix string "m11,m12,m21,m22,dx,dy".
+     * @param transformStr matrix string
+     * @return the matrix, or the identity if the string is malformed
+     */
+    static QTransform parseTransformMatrix(const QString &transformStr);
+
+    /**
+     * @brief Estimate a character width when no font metrics are available.
+     * @param ch character (one UTF-16 unit)
+     * @param fontSize font size
+     * @return estimated width
+     */
+    static double estimateCharWidth(QChar ch, double fontSize);
+
+    /**
+     * @brief Parse the Indices attribute into per-character widths.
+     * @param indicesStr Indices attribute value
+     * @param textLength length of the UnicodeString in UTF-16 units
+     * @param fontSize font size
+     * @return at most textLength widths aligned with the text; 0 marks "unknown"
+     */
+    static QList<double> parseIndices(const QString &indicesStr, int textLength, double fontSize);
+
+    /**
+     * @brief Read a FixedPage part from the XPS ZIP package.
+     * @param filePath path of the XPS file
+     * @param pageIndex zero-based page index
+     * @return XML data, or an empty QByteArray on failure
+     */
+    static QByteArray readFixedPageFromZip(const QString &filePath, int pageIndex);
+
+    /**
+     * @brief Build the part name of a FixedPage inside the package.
+     * @param filePath path of the XPS file (currently unused)
+     * @param pageIndex zero-based page index
+     * @return part name inside the ZIP, or an empty string for a negative index
+     */
+    static QString findFixedPagePath(const QString &filePath, int pageIndex);
+};
+
+} // namespace deepin_reader
+
+#endif // XPSTEXTEXTRACTOR_H