From d95e1cda78b9023f00b6ca0231b0ff05c935d29b Mon Sep 17 00:00:00 2001
From: qin-ctx <qinhaojie.exe@bytedance.com>
Date: Wed, 4 Mar 2026 17:38:25 +0800
Subject: [PATCH 1/3] feat(parse): auto-group sections into subdirectories when
 file count exceeds limit

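When a document is split into many parts, automatically organize them into
subdirectories to avoid having too many files in a single directory. The
limit is configurable via the new ParserConfig.max_children_per_dir option
(default 50). Also refactor PDF bookmark extraction for clarity and group
bookmarks by page number with a defaultdict.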

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 openviking/parse/parsers/markdown.py         |  84 +++++++++++++--
 openviking/parse/parsers/pdf.py              | 103 ++++++++-----------
 openviking_cli/utils/config/parser_config.py |   3 +
 3 files changed, 123 insertions(+), 67 deletions(-)

diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index 7301b335..110f1477 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -52,6 +52,7 @@ class MarkdownParser(BaseParser):
     DEFAULT_MAX_SECTION_SIZE = 1024  # Maximum tokens per section
     DEFAULT_MIN_SECTION_TOKENS = 512  # Minimum tokens to create a separate section
     MAX_MERGED_FILENAME_LENGTH = 32  # Maximum length for merged section filenames
+    MAX_CHILDREN_PER_DIR = 50  # Maximum files per directory
 
     def __init__(
         self,
@@ -399,10 +400,24 @@ async def _parse_and_create_structure(
         if not headings:
             logger.info("[MarkdownParser] No headings, splitting by paragraphs")
             parts = self._smart_split_content(content, max_size)
-            for part_idx, part in enumerate(parts, 1):
-                part_file = f"{root_dir}/{doc_name}_{part_idx}.md"
-                await viking_fs.write_file(part_file, part)
-            logger.debug(f"[MarkdownParser] Split into {len(parts)} parts")
+            max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR
+            groups = self._auto_group_sections(parts, doc_name, max_children)
+
+            for group_idx, (subdir_name, group_parts) in enumerate(groups):
+                if subdir_name:
+                    subdir_path = f"{root_dir}/{subdir_name}"
+                    await viking_fs.mkdir(subdir_path)
+                    base_path = subdir_path
+                else:
+                    base_path = root_dir
+
+                offset = group_idx * max_children
+                for part_idx, part in enumerate(group_parts, offset + 1):
+                    await viking_fs.write_file(f"{base_path}/{doc_name}_{part_idx}.md", part)
+
+            logger.debug(
+                f"[MarkdownParser] Split into {len(parts)} parts across {len(groups)} groups"
+            )
             return
 
         # Build virtual section list (pre-heading content as first virtual section)
@@ -450,8 +465,6 @@ async def _process_sections_with_merge(
         min_size: int,
     ) -> None:
         """Process sections with small section merge logic."""
-        viking_fs = self._get_viking_fs()
-
         # Expand section info
         expanded = [
             section
@@ -460,6 +473,39 @@ async def _process_sections_with_merge(
             for section in sections
         ]
 
+        # Auto-group when too many sibling sections into subdirectories
+        max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR
+        if len(expanded) > max_children:
+            viking_fs = self._get_viking_fs()
+            for i in range(0, len(expanded), max_children):
+                chunk = expanded[i : i + max_children]
+                first_name = chunk[0]["name"]
+                last_name = chunk[-1]["name"]
+                subdir_name = self._sanitize_for_path(f"{first_name}_to_{last_name}")
+                subdir_path = f"{parent_dir}/{subdir_name}"
+                await viking_fs.mkdir(subdir_path, exist_ok=True)
+                await self._process_expanded_sections(
+                    content, headings, subdir_path, chunk, parent_name, max_size, min_size
+                )
+            return
+
+        await self._process_expanded_sections(
+            content, headings, parent_dir, expanded, parent_name, max_size, min_size
+        )
+
+    async def _process_expanded_sections(
+        self,
+        content: str,
+        headings: List[Tuple[int, int, str, int]],
+        parent_dir: str,
+        expanded: List[Dict[str, Any]],
+        parent_name: str,
+        max_size: int,
+        min_size: int,
+    ) -> None:
+        """Process a list of already-expanded sections with merge logic."""
+        viking_fs = self._get_viking_fs()
+
         pending = []
         for sec in expanded:
             name, tokens, content_text = sec["name"], sec["tokens"], sec["content"]
@@ -689,3 +735,29 @@ def _estimate_token_count(self, content: str) -> int:
         cjk_chars = len(re.findall(r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]", content))
         other_chars = len(re.findall(r"[^\s]", content)) - cjk_chars
         return int(cjk_chars * 0.7 + other_chars * 0.3)
+
+    def _auto_group_sections(
+        self, parts: List[str], doc_name: str, group_size: int
+    ) -> List[Tuple[str, List[str]]]:
+        """Group flat parts into subdirectories.
+
+        Args:
+            parts: List of content parts
+            doc_name: Document name
+            group_size: Maximum number of files per group
+
+        Returns:
+            List of (subdir_name, parts) tuples.
+            Empty subdir_name means no subdirectory needed.
+        """
+        if len(parts) <= group_size:
+            return [("", parts)]
+
+        groups = []
+        for i in range(0, len(parts), group_size):
+            chunk = parts[i : i + group_size]
+            start = i + 1
+            end = i + len(chunk)
+            subdir = f"{doc_name}_{start:03d}-{end:03d}"
+            groups.append((subdir, chunk))
+        return groups
diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py
index cb146961..592fbe12 100644
--- a/openviking/parse/parsers/pdf.py
+++ b/openviking/parse/parsers/pdf.py
@@ -14,6 +14,7 @@
 
 import logging
 import time
+from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -190,73 +191,54 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any
 
     def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]:
         """
-        Extract PDF bookmarks/outlines and map them to page numbers.
-
-        Uses pdfplumber's underlying pdfminer to access the PDF document
-        outline (table of contents). Each bookmark entry is mapped to a
-        page number by resolving its destination object.
-
-        Args:
-            pdf: An open pdfplumber PDF object
-
-        Returns:
-            List of dicts with keys: title, level, page_num (1-based).
-            Empty list if no bookmarks are found or extraction fails.
+        Returns: [{level: int, title: str, page_num: int(1-based)}]
         """
-        bookmarks: List[Dict[str, Any]] = []
-
         try:
-            # Access pdfminer's document object through pdfplumber
-            doc = pdf.doc
-            if not hasattr(doc, "get_outlines"):
+            if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"):
                 return []
 
-            # Build a mapping from pdfminer page objects to page numbers
-            # pdfplumber pages are 0-indexed internally
-            objid_to_pagenum: Dict[int, int] = {}
-            for i, page in enumerate(pdf.pages):
-                if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid"):
-                    objid_to_pagenum[page.page_obj.objid] = i + 1  # 1-based
+            outlines = pdf.doc.get_outlines()
+            if not outlines:
+                return []
 
-            for level, title, dest, _a, _se in doc.get_outlines():
+            # Build a mapping from pdfminer page objects to page numbers
+            objid_to_pagenum = {
+                page.page_obj.objid: i + 1
+                for i, page in enumerate(pdf.pages)
+                if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid")
+            }
+
+            bookmarks = []
+            for level, title, dest, _action, _se in outlines:
                 if not title or not title.strip():
                     continue
 
                 page_num = None
-
-                # Resolve destination to page number
-                # dest can be various types depending on the PDF structure
-                if dest:
-                    try:
-                        # dest is typically a list where first element is a page reference
-                        if isinstance(dest, (list, tuple)) and len(dest) > 0:
-                            page_ref = dest[0]
-                            if hasattr(page_ref, "objid"):
-                                page_num = objid_to_pagenum.get(page_ref.objid)
-                            elif hasattr(page_ref, "resolve"):
-                                resolved = page_ref.resolve()
-                                if hasattr(resolved, "objid"):
-                                    page_num = objid_to_pagenum.get(resolved.objid)
-                    except Exception:
-                        pass  # Best-effort resolution
-
-                # Cap heading level to 1-6 for markdown compatibility
-                md_level = min(max(level, 1), 6)
+                try:
+                    if dest and len(dest) > 0:
+                        page_ref = dest[0]
+                        if hasattr(page_ref, "objid"):
+                            page_num = objid_to_pagenum.get(page_ref.objid)
+                        elif hasattr(page_ref, "resolve"):
+                            resolved = page_ref.resolve()
+                            if hasattr(resolved, "objid"):
+                                page_num = objid_to_pagenum.get(resolved.objid)
+                except Exception:
+                    pass
 
                 bookmarks.append(
                     {
+                        "level": min(max(level, 1), 6),
                         "title": title.strip(),
-                        "level": md_level,
-                        "page_num": page_num,  # May be None if resolution failed
+                        "page_num": page_num,
                     }
                 )
 
-            logger.info(f"Extracted {len(bookmarks)} bookmarks from PDF outline")
+            return bookmarks
 
         except Exception as e:
-            logger.debug(f"Bookmark extraction failed (PDF may have no outlines): {e}")
-
-        return bookmarks
+            logger.warning(f"Failed to extract bookmarks: {e}")
+            return []
 
     async def _convert_local(
         self, pdf_path: Path, storage=None, resource_name: Optional[str] = None
@@ -306,24 +288,23 @@ async def _convert_local(
             with pdfplumber.open(str(pdf_path)) as pdf:
                 meta["total_pages"] = len(pdf.pages)
 
-                # Step 1: Extract bookmarks and group by page number
+                # Extract bookmarks and group by page number
                 bookmarks = self._extract_bookmarks(pdf)
                 meta["bookmarks_extracted"] = len(bookmarks)
+                logger.info(f"Extracted {len(bookmarks)} bookmarks")
 
-                # Build a lookup: page_num -> list of bookmarks to inject before that page's content
-                bookmarks_by_page: Dict[int, List[Dict[str, Any]]] = {}
+                # Build a lookup: page_num -> list of bookmarks
+                bookmarks_by_page = defaultdict(list)
                 for bm in bookmarks:
-                    pg = bm.get("page_num")
-                    if pg is not None:
-                        bookmarks_by_page.setdefault(pg, []).append(bm)
+                    if bm["page_num"]:
+                        bookmarks_by_page[bm["page_num"]].append(bm)
 
-                # Step 2: Extract content page by page, injecting bookmark headings
                 for page_num, page in enumerate(pdf.pages, 1):
-                    # Inject bookmark headings for this page (before page content)
-                    if page_num in bookmarks_by_page:
-                        for bm in bookmarks_by_page[page_num]:
-                            heading_prefix = "#" * bm["level"]
-                            parts.append(f"{heading_prefix} {bm['title']}")
+                    # Inject bookmark headings for this page
+                    page_bookmarks = bookmarks_by_page.get(page_num, [])
+                    for bm in page_bookmarks:
+                        heading_prefix = "#" * bm["level"]
+                        parts.append(f"\n{heading_prefix} {bm['title']}\n")
 
                     # Extract text
                     text = page.extract_text()
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py
index abd43fed..5e52d514 100644
--- a/openviking_cli/utils/config/parser_config.py
+++ b/openviking_cli/utils/config/parser_config.py
@@ -37,6 +37,9 @@ class ParserConfig:
     max_section_size: int = 1000  # Maximum tokens per section before splitting
     section_size_flexibility: float = 0.3  # Allow 30% overflow to maintain coherence
 
+    # Directory structure configuration
+    max_children_per_dir: int = 50  # Maximum files per directory (0=unlimited)
+
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ParserConfig":
         """

From 17621251ffdb7b866ba792ed934a8e93d0ca3d4d Mon Sep 17 00:00:00 2001
From: qin-ctx <qinhaojie.exe@bytedance.com>
Date: Wed, 4 Mar 2026 18:00:04 +0800
Subject: [PATCH 2/3] refactor: remove auto-group directory logic and translate
 comments to English

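Remove the max_children_per_dir config and the auto-grouping of sections
into subdirectories when the file count exceeds a limit. Translate all
Chinese comments and docstrings to English across pdf.py and
parser_config.py.

Also add configurable PDF heading detection: bookmarks are tried first and
a font-size analysis (_detect_headings_by_font) is used as a fallback when
no bookmarks are found, or on its own when heading_detection is "font".
This is controlled by the new PDFConfig options heading_detection,
font_heading_min_delta and max_heading_levels. The conversion metadata key
bookmarks_extracted is renamed to bookmarks_found and a heading_source key
is added.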

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 openviking/parse/parsers/markdown.py         |  64 +----
 openviking/parse/parsers/pdf.py              | 254 ++++++++++++++-----
 openviking_cli/utils/config/parser_config.py |  14 +-
 3 files changed, 208 insertions(+), 124 deletions(-)

diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index 110f1477..10393311 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -52,7 +52,6 @@ class MarkdownParser(BaseParser):
     DEFAULT_MAX_SECTION_SIZE = 1024  # Maximum tokens per section
     DEFAULT_MIN_SECTION_TOKENS = 512  # Minimum tokens to create a separate section
     MAX_MERGED_FILENAME_LENGTH = 32  # Maximum length for merged section filenames
-    MAX_CHILDREN_PER_DIR = 50  # Maximum files per directory
 
     def __init__(
         self,
@@ -400,24 +399,9 @@ async def _parse_and_create_structure(
         if not headings:
             logger.info("[MarkdownParser] No headings, splitting by paragraphs")
             parts = self._smart_split_content(content, max_size)
-            max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR
-            groups = self._auto_group_sections(parts, doc_name, max_children)
-
-            for group_idx, (subdir_name, group_parts) in enumerate(groups):
-                if subdir_name:
-                    subdir_path = f"{root_dir}/{subdir_name}"
-                    await viking_fs.mkdir(subdir_path)
-                    base_path = subdir_path
-                else:
-                    base_path = root_dir
-
-                offset = group_idx * max_children
-                for part_idx, part in enumerate(group_parts, offset + 1):
-                    await viking_fs.write_file(f"{base_path}/{doc_name}_{part_idx}.md", part)
-
-            logger.debug(
-                f"[MarkdownParser] Split into {len(parts)} parts across {len(groups)} groups"
-            )
+            for part_idx, part in enumerate(parts, 1):
+                await viking_fs.write_file(f"{root_dir}/{doc_name}_{part_idx}.md", part)
+            logger.debug(f"[MarkdownParser] Split into {len(parts)} parts")
             return
 
         # Build virtual section list (pre-heading content as first virtual section)
@@ -473,22 +457,6 @@ async def _process_sections_with_merge(
             for section in sections
         ]
 
-        # Auto-group when too many sibling sections into subdirectories
-        max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR
-        if len(expanded) > max_children:
-            viking_fs = self._get_viking_fs()
-            for i in range(0, len(expanded), max_children):
-                chunk = expanded[i : i + max_children]
-                first_name = chunk[0]["name"]
-                last_name = chunk[-1]["name"]
-                subdir_name = self._sanitize_for_path(f"{first_name}_to_{last_name}")
-                subdir_path = f"{parent_dir}/{subdir_name}"
-                await viking_fs.mkdir(subdir_path, exist_ok=True)
-                await self._process_expanded_sections(
-                    content, headings, subdir_path, chunk, parent_name, max_size, min_size
-                )
-            return
-
         await self._process_expanded_sections(
             content, headings, parent_dir, expanded, parent_name, max_size, min_size
         )
@@ -735,29 +703,3 @@ def _estimate_token_count(self, content: str) -> int:
         cjk_chars = len(re.findall(r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]", content))
         other_chars = len(re.findall(r"[^\s]", content)) - cjk_chars
         return int(cjk_chars * 0.7 + other_chars * 0.3)
-
-    def _auto_group_sections(
-        self, parts: List[str], doc_name: str, group_size: int
-    ) -> List[Tuple[str, List[str]]]:
-        """Group flat parts into subdirectories.
-
-        Args:
-            parts: List of content parts
-            doc_name: Document name
-            group_size: Maximum number of files per group
-
-        Returns:
-            List of (subdir_name, parts) tuples.
-            Empty subdir_name means no subdirectory needed.
-        """
-        if len(parts) <= group_size:
-            return [("", parts)]
-
-        groups = []
-        for i in range(0, len(parts), group_size):
-            chunk = parts[i : i + group_size]
-            start = i + 1
-            end = i + len(chunk)
-            subdir = f"{doc_name}_{start:03d}-{end:03d}"
-            groups.append((subdir, chunk))
-        return groups
diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py
index 592fbe12..9fb19743 100644
--- a/openviking/parse/parsers/pdf.py
+++ b/openviking/parse/parsers/pdf.py
@@ -13,8 +13,9 @@
 """
 
 import logging
+import re
 import time
-from collections import defaultdict
+from collections import Counter, defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -189,57 +190,6 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any
         else:
             raise ValueError(f"Unknown strategy: {self.config.strategy}")
 
-    def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]:
-        """
-        Returns: [{level: int, title: str, page_num: int(1-based)}]
-        """
-        try:
-            if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"):
-                return []
-
-            outlines = pdf.doc.get_outlines()
-            if not outlines:
-                return []
-
-            # Build a mapping from pdfminer page objects to page numbers
-            objid_to_pagenum = {
-                page.page_obj.objid: i + 1
-                for i, page in enumerate(pdf.pages)
-                if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid")
-            }
-
-            bookmarks = []
-            for level, title, dest, _action, _se in outlines:
-                if not title or not title.strip():
-                    continue
-
-                page_num = None
-                try:
-                    if dest and len(dest) > 0:
-                        page_ref = dest[0]
-                        if hasattr(page_ref, "objid"):
-                            page_num = objid_to_pagenum.get(page_ref.objid)
-                        elif hasattr(page_ref, "resolve"):
-                            resolved = page_ref.resolve()
-                            if hasattr(resolved, "objid"):
-                                page_num = objid_to_pagenum.get(resolved.objid)
-                except Exception:
-                    pass
-
-                bookmarks.append(
-                    {
-                        "level": min(max(level, 1), 6),
-                        "title": title.strip(),
-                        "page_num": page_num,
-                    }
-                )
-
-            return bookmarks
-
-        except Exception as e:
-            logger.warning(f"Failed to extract bookmarks: {e}")
-            return []
-
     async def _convert_local(
         self, pdf_path: Path, storage=None, resource_name: Optional[str] = None
     ) -> tuple[str, Dict[str, Any]]:
@@ -281,26 +231,41 @@ async def _convert_local(
             "pages_processed": 0,
             "images_extracted": 0,
             "tables_extracted": 0,
-            "bookmarks_extracted": 0,
+            "bookmarks_found": 0,
+            "heading_source": "none",
         }
 
         try:
             with pdfplumber.open(str(pdf_path)) as pdf:
                 meta["total_pages"] = len(pdf.pages)
 
-                # Extract bookmarks and group by page number
-                bookmarks = self._extract_bookmarks(pdf)
-                meta["bookmarks_extracted"] = len(bookmarks)
-                logger.info(f"Extracted {len(bookmarks)} bookmarks")
+                # Extract structure (bookmarks → font fallback)
+                detection_mode = self.config.heading_detection
+                bookmarks = []
+                heading_source = "none"
+
+                if detection_mode in ("bookmarks", "auto"):
+                    bookmarks = self._extract_bookmarks(pdf)
+                    if bookmarks:
+                        heading_source = "bookmarks"
+
+                if not bookmarks and detection_mode in ("font", "auto"):
+                    bookmarks = self._detect_headings_by_font(pdf)
+                    if bookmarks:
+                        heading_source = "font_analysis"
 
-                # Build a lookup: page_num -> list of bookmarks
+                meta["bookmarks_found"] = len(bookmarks)
+                meta["heading_source"] = heading_source
+                logger.info(f"Heading detection: {heading_source}, found {len(bookmarks)} headings")
+
+                # Group bookmarks by page_num
                 bookmarks_by_page = defaultdict(list)
                 for bm in bookmarks:
                     if bm["page_num"]:
                         bookmarks_by_page[bm["page_num"]].append(bm)
 
                 for page_num, page in enumerate(pdf.pages, 1):
-                    # Inject bookmark headings for this page
+                    # Inject headings before page text
                     page_bookmarks = bookmarks_by_page.get(page_num, [])
                     for bm in page_bookmarks:
                         heading_prefix = "#" * bm["level"]
@@ -361,7 +326,7 @@ async def _convert_local(
             markdown_content = "\n\n".join(parts)
             logger.info(
                 f"Local conversion: {meta['pages_processed']}/{meta['total_pages']} pages, "
-                f"{meta['bookmarks_extracted']} bookmarks, "
+                f"{meta['bookmarks_found']} bookmarks ({meta['heading_source']}), "
                 f"{meta['images_extracted']} images, {meta['tables_extracted']} tables → "
                 f"{len(markdown_content)} chars"
             )
@@ -372,6 +337,175 @@ async def _convert_local(
             logger.error(f"pdfplumber conversion failed: {e}")
             raise
 
+    def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]:
+        """Extract bookmark structure from PDF outlines.
+
+        Returns: [{level: int, title: str, page_num: int(1-based)}]
+        """
+        try:
+            if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"):
+                return []
+
+            outlines = pdf.doc.get_outlines()
+            if not outlines:
+                return []
+
+            # Build objid → page_number mapping
+            objid_to_num = {
+                page.page_obj.objid: i + 1
+                for i, page in enumerate(pdf.pages)
+                if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid")
+            }
+
+            bookmarks = []
+            for level, title, dest, _action, _se in outlines:
+                if not title or not title.strip():
+                    continue
+
+                page_num = None
+                try:
+                    if dest and len(dest) > 0:
+                        page_ref = dest[0]
+                        if hasattr(page_ref, "objid"):
+                            page_num = objid_to_num.get(page_ref.objid)
+                        elif hasattr(page_ref, "resolve"):
+                            resolved = page_ref.resolve()
+                            if hasattr(resolved, "objid"):
+                                page_num = objid_to_num.get(resolved.objid)
+                except Exception:
+                    pass
+
+                bookmarks.append(
+                    {
+                        "level": min(max(level, 1), 6),
+                        "title": title.strip(),
+                        "page_num": page_num,
+                    }
+                )
+
+            return bookmarks
+
+        except Exception as e:
+            logger.warning(f"Failed to extract bookmarks: {e}")
+            return []
+
+    def _detect_headings_by_font(self, pdf) -> List[Dict[str, Any]]:
+        """Detect headings by font size analysis.
+
+        Returns: [{level: int, title: str, page_num: int(1-based)}]
+        """
+        try:
+            # Step 1: Sample font size distribution (every 5th page)
+            size_counter: Counter = Counter()
+            sample_pages = pdf.pages[::5]
+            for page in sample_pages:
+                for char in page.chars:
+                    if char["text"].strip():
+                        rounded = round(char["size"] * 2) / 2
+                        size_counter[rounded] += 1
+
+            if not size_counter:
+                return []
+
+            # Step 2: Determine body font size and heading font sizes
+            body_size = size_counter.most_common(1)[0][0]
+            min_delta = self.config.font_heading_min_delta
+
+            heading_sizes = sorted(
+                [
+                    s
+                    for s, count in size_counter.items()
+                    if s >= body_size + min_delta and count < size_counter[body_size] * 0.3
+                ],
+                reverse=True,
+            )
+
+            max_levels = self.config.max_heading_levels
+            heading_sizes = heading_sizes[:max_levels]
+
+            if not heading_sizes:
+                logger.debug(f"Font analysis: body_size={body_size}pt, no heading sizes found")
+                return []
+
+            size_to_level = {s: i + 1 for i, s in enumerate(heading_sizes)}
+            logger.debug(
+                f"Font analysis: body_size={body_size}pt, "
+                f"heading_sizes={heading_sizes}, size_to_level={size_to_level}"
+            )
+
+            # Step 3: Extract heading text page by page
+            headings: List[Dict[str, Any]] = []
+
+            def flush_line(chars_to_flush: list, page_num: int) -> None:
+                if not chars_to_flush:
+                    return
+                title = "".join(c["text"] for c in chars_to_flush).strip()
+                size = round(chars_to_flush[0]["size"] * 2) / 2
+
+                if len(title) < 2:
+                    return
+                if len(title) > 100:
+                    return
+                if title.isdigit():
+                    return
+                if re.match(r"^[\d\s.·…]+$", title):
+                    return
+
+                headings.append(
+                    {
+                        "level": size_to_level[size],
+                        "title": title,
+                        "page_num": page_num,
+                    }
+                )
+
+            for page in pdf.pages:
+                page_num = page.page_number + 1
+                chars = sorted(page.chars, key=lambda c: (c["top"], c["x0"]))
+
+                current_line_chars: list = []
+                current_top = None
+
+                for char in chars:
+                    # Performance: headings won't appear in bottom 70% of page
+                    if char["top"] > page.height * 0.3:
+                        flush_line(current_line_chars, page_num)
+                        current_line_chars = []
+                        break
+
+                    rounded_size = round(char["size"] * 2) / 2
+                    if rounded_size not in size_to_level:
+                        flush_line(current_line_chars, page_num)
+                        current_line_chars = []
+                        current_top = None
+                        continue
+
+                    # Same line check (top offset < 2pt)
+                    if current_top is not None and abs(char["top"] - current_top) > 2:
+                        flush_line(current_line_chars, page_num)
+                        current_line_chars = []
+
+                    current_line_chars.append(char)
+                    current_top = char["top"]
+
+                flush_line(current_line_chars, page_num)
+
+            # Step 4: Deduplicate - filter headers appearing on >30% of pages
+            title_page_count: Counter = Counter(h["title"] for h in headings)
+            total_pages = len(pdf.pages)
+            header_titles = {t for t, c in title_page_count.items() if c > total_pages * 0.3}
+            headings = [h for h in headings if h["title"] not in header_titles]
+
+            logger.debug(
+                f"Font heading detection: {len(headings)} headings found "
+                f"(filtered {len(header_titles)} header titles)"
+            )
+            return headings
+
+        except Exception as e:
+            logger.warning(f"Failed to detect headings by font: {e}")
+            return []
+
     def _extract_image_from_page(self, page, img_info: dict) -> Optional[bytes]:
         """
         Extract image data from PDF page.
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py
index 5e52d514..f59b2fa3 100644
--- a/openviking_cli/utils/config/parser_config.py
+++ b/openviking_cli/utils/config/parser_config.py
@@ -37,9 +37,6 @@ class ParserConfig:
     max_section_size: int = 1000  # Maximum tokens per section before splitting
     section_size_flexibility: float = 0.3  # Allow 30% overflow to maintain coherence
 
-    # Directory structure configuration
-    max_children_per_dir: int = 50  # Maximum files per directory (0=unlimited)
-
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "ParserConfig":
         """
@@ -149,6 +146,11 @@ class PDFConfig(ParserConfig):
     mineru_timeout: float = 300.0  # Request timeout in seconds (5 minutes)
     mineru_params: Optional[dict] = None  # Additional API parameters
 
+    # Heading detection configuration
+    heading_detection: str = "auto"  # "bookmarks" | "font" | "auto" | "none"
+    font_heading_min_delta: float = 1.5  # Minimum font size delta from body text (pt)
+    max_heading_levels: int = 4  # Maximum heading levels for font analysis
+
     def validate(self) -> None:
         """
         Validate configuration.
@@ -172,6 +174,12 @@ def validate(self) -> None:
         if self.mineru_timeout <= 0:
             raise ValueError("mineru_timeout must be positive")
 
+        if self.heading_detection not in ("bookmarks", "font", "auto", "none"):
+            raise ValueError(f"Invalid heading_detection: {self.heading_detection}")
+
+        if self.font_heading_min_delta <= 0:
+            raise ValueError("font_heading_min_delta must be positive")
+
 
 @dataclass
 class CodeHostingConfig(ParserConfig):

From cbda51debc54e10f119f888f677360ce7ca2fa0d Mon Sep 17 00:00:00 2001
From: qin-ctx <qinhaojie.exe@bytedance.com>
Date: Wed, 4 Mar 2026 18:03:58 +0800
Subject: [PATCH 3/3] refactor: merge _process_expanded_sections back into
 _process_sections_with_merge

The separate function was only needed for the removed auto-group logic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 openviking/parse/parsers/markdown.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index 10393311..fb570e30 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -449,6 +449,8 @@ async def _process_sections_with_merge(
         min_size: int,
     ) -> None:
         """Process sections with small section merge logic."""
+        viking_fs = self._get_viking_fs()
+
         # Expand section info
         expanded = [
             section
@@ -457,23 +459,6 @@ async def _process_sections_with_merge(
             for section in sections
         ]
 
-        await self._process_expanded_sections(
-            content, headings, parent_dir, expanded, parent_name, max_size, min_size
-        )
-
-    async def _process_expanded_sections(
-        self,
-        content: str,
-        headings: List[Tuple[int, int, str, int]],
-        parent_dir: str,
-        expanded: List[Dict[str, Any]],
-        parent_name: str,
-        max_size: int,
-        min_size: int,
-    ) -> None:
-        """Process a list of already-expanded sections with merge logic."""
-        viking_fs = self._get_viking_fs()
-
         pending = []
         for sec in expanded:
             name, tokens, content_text = sec["name"], sec["tokens"], sec["content"]