From d95e1cda78b9023f00b6ca0231b0ff05c935d29b Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 4 Mar 2026 17:38:25 +0800 Subject: [PATCH 1/3] feat(parse): auto-group sections into subdirectories when file count exceeds limit When a document is split into many parts, automatically organize them into subdirectories to avoid having too many files in a single directory. Also refactors PDF bookmark extraction for clarity and uses defaultdict. Co-Authored-By: Claude Opus 4.6 --- openviking/parse/parsers/markdown.py | 84 +++++++++++++-- openviking/parse/parsers/pdf.py | 103 ++++++++----------- openviking_cli/utils/config/parser_config.py | 3 + 3 files changed, 123 insertions(+), 67 deletions(-) diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index 7301b335..110f1477 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -52,6 +52,7 @@ class MarkdownParser(BaseParser): DEFAULT_MAX_SECTION_SIZE = 1024 # Maximum tokens per section DEFAULT_MIN_SECTION_TOKENS = 512 # Minimum tokens to create a separate section MAX_MERGED_FILENAME_LENGTH = 32 # Maximum length for merged section filenames + MAX_CHILDREN_PER_DIR = 50 # Maximum files per directory def __init__( self, @@ -399,10 +400,24 @@ async def _parse_and_create_structure( if not headings: logger.info("[MarkdownParser] No headings, splitting by paragraphs") parts = self._smart_split_content(content, max_size) - for part_idx, part in enumerate(parts, 1): - part_file = f"{root_dir}/{doc_name}_{part_idx}.md" - await viking_fs.write_file(part_file, part) - logger.debug(f"[MarkdownParser] Split into {len(parts)} parts") + max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR + groups = self._auto_group_sections(parts, doc_name, max_children) + + for group_idx, (subdir_name, group_parts) in enumerate(groups): + if subdir_name: + subdir_path = f"{root_dir}/{subdir_name}" + await viking_fs.mkdir(subdir_path) + base_path = 
subdir_path + else: + base_path = root_dir + + offset = group_idx * max_children + for part_idx, part in enumerate(group_parts, offset + 1): + await viking_fs.write_file(f"{base_path}/{doc_name}_{part_idx}.md", part) + + logger.debug( + f"[MarkdownParser] Split into {len(parts)} parts across {len(groups)} groups" + ) return # Build virtual section list (pre-heading content as first virtual section) @@ -450,8 +465,6 @@ async def _process_sections_with_merge( min_size: int, ) -> None: """Process sections with small section merge logic.""" - viking_fs = self._get_viking_fs() - # Expand section info expanded = [ section @@ -460,6 +473,39 @@ async def _process_sections_with_merge( for section in sections ] + # Auto-group when too many sibling sections into subdirectories + max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR + if len(expanded) > max_children: + viking_fs = self._get_viking_fs() + for i in range(0, len(expanded), max_children): + chunk = expanded[i : i + max_children] + first_name = chunk[0]["name"] + last_name = chunk[-1]["name"] + subdir_name = self._sanitize_for_path(f"{first_name}_to_{last_name}") + subdir_path = f"{parent_dir}/{subdir_name}" + await viking_fs.mkdir(subdir_path, exist_ok=True) + await self._process_expanded_sections( + content, headings, subdir_path, chunk, parent_name, max_size, min_size + ) + return + + await self._process_expanded_sections( + content, headings, parent_dir, expanded, parent_name, max_size, min_size + ) + + async def _process_expanded_sections( + self, + content: str, + headings: List[Tuple[int, int, str, int]], + parent_dir: str, + expanded: List[Dict[str, Any]], + parent_name: str, + max_size: int, + min_size: int, + ) -> None: + """Process a list of already-expanded sections with merge logic.""" + viking_fs = self._get_viking_fs() + pending = [] for sec in expanded: name, tokens, content_text = sec["name"], sec["tokens"], sec["content"] @@ -689,3 +735,29 @@ def _estimate_token_count(self, 
content: str) -> int: cjk_chars = len(re.findall(r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]", content)) other_chars = len(re.findall(r"[^\s]", content)) - cjk_chars return int(cjk_chars * 0.7 + other_chars * 0.3) + + def _auto_group_sections( + self, parts: List[str], doc_name: str, group_size: int + ) -> List[Tuple[str, List[str]]]: + """Group flat parts into subdirectories. + + Args: + parts: List of content parts + doc_name: Document name + group_size: Maximum number of files per group + + Returns: + List of (subdir_name, parts) tuples. + Empty subdir_name means no subdirectory needed. + """ + if len(parts) <= group_size: + return [("", parts)] + + groups = [] + for i in range(0, len(parts), group_size): + chunk = parts[i : i + group_size] + start = i + 1 + end = i + len(chunk) + subdir = f"{doc_name}_{start:03d}-{end:03d}" + groups.append((subdir, chunk)) + return groups diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py index cb146961..592fbe12 100644 --- a/openviking/parse/parsers/pdf.py +++ b/openviking/parse/parsers/pdf.py @@ -14,6 +14,7 @@ import logging import time +from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -190,73 +191,54 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]: """ - Extract PDF bookmarks/outlines and map them to page numbers. - - Uses pdfplumber's underlying pdfminer to access the PDF document - outline (table of contents). Each bookmark entry is mapped to a - page number by resolving its destination object. - - Args: - pdf: An open pdfplumber PDF object - - Returns: - List of dicts with keys: title, level, page_num (1-based). - Empty list if no bookmarks are found or extraction fails. 
+ Returns: [{level: int, title: str, page_num: int(1-based)}] """ - bookmarks: List[Dict[str, Any]] = [] - try: - # Access pdfminer's document object through pdfplumber - doc = pdf.doc - if not hasattr(doc, "get_outlines"): + if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"): return [] - # Build a mapping from pdfminer page objects to page numbers - # pdfplumber pages are 0-indexed internally - objid_to_pagenum: Dict[int, int] = {} - for i, page in enumerate(pdf.pages): - if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid"): - objid_to_pagenum[page.page_obj.objid] = i + 1 # 1-based + outlines = pdf.doc.get_outlines() + if not outlines: + return [] - for level, title, dest, _a, _se in doc.get_outlines(): + # Build a mapping from pdfminer page objects to page numbers + objid_to_pagenum = { + page.page_obj.objid: i + 1 + for i, page in enumerate(pdf.pages) + if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid") + } + + bookmarks = [] + for level, title, dest, _action, _se in outlines: if not title or not title.strip(): continue page_num = None - - # Resolve destination to page number - # dest can be various types depending on the PDF structure - if dest: - try: - # dest is typically a list where first element is a page reference - if isinstance(dest, (list, tuple)) and len(dest) > 0: - page_ref = dest[0] - if hasattr(page_ref, "objid"): - page_num = objid_to_pagenum.get(page_ref.objid) - elif hasattr(page_ref, "resolve"): - resolved = page_ref.resolve() - if hasattr(resolved, "objid"): - page_num = objid_to_pagenum.get(resolved.objid) - except Exception: - pass # Best-effort resolution - - # Cap heading level to 1-6 for markdown compatibility - md_level = min(max(level, 1), 6) + try: + if dest and len(dest) > 0: + page_ref = dest[0] + if hasattr(page_ref, "objid"): + page_num = objid_to_pagenum.get(page_ref.objid) + elif hasattr(page_ref, "resolve"): + resolved = page_ref.resolve() + if hasattr(resolved, "objid"): + page_num = 
objid_to_pagenum.get(resolved.objid) + except Exception: + pass bookmarks.append( { + "level": min(max(level, 1), 6), "title": title.strip(), - "level": md_level, - "page_num": page_num, # May be None if resolution failed + "page_num": page_num, } ) - logger.info(f"Extracted {len(bookmarks)} bookmarks from PDF outline") + return bookmarks except Exception as e: - logger.debug(f"Bookmark extraction failed (PDF may have no outlines): {e}") - - return bookmarks + logger.warning(f"Failed to extract bookmarks: {e}") + return [] async def _convert_local( self, pdf_path: Path, storage=None, resource_name: Optional[str] = None @@ -306,24 +288,23 @@ async def _convert_local( with pdfplumber.open(str(pdf_path)) as pdf: meta["total_pages"] = len(pdf.pages) - # Step 1: Extract bookmarks and group by page number + # Extract bookmarks and group by page number bookmarks = self._extract_bookmarks(pdf) meta["bookmarks_extracted"] = len(bookmarks) + logger.info(f"Extracted {len(bookmarks)} bookmarks") - # Build a lookup: page_num -> list of bookmarks to inject before that page's content - bookmarks_by_page: Dict[int, List[Dict[str, Any]]] = {} + # Build a lookup: page_num -> list of bookmarks + bookmarks_by_page = defaultdict(list) for bm in bookmarks: - pg = bm.get("page_num") - if pg is not None: - bookmarks_by_page.setdefault(pg, []).append(bm) + if bm["page_num"]: + bookmarks_by_page[bm["page_num"]].append(bm) - # Step 2: Extract content page by page, injecting bookmark headings for page_num, page in enumerate(pdf.pages, 1): - # Inject bookmark headings for this page (before page content) - if page_num in bookmarks_by_page: - for bm in bookmarks_by_page[page_num]: - heading_prefix = "#" * bm["level"] - parts.append(f"{heading_prefix} {bm['title']}") + # Inject bookmark headings for this page + page_bookmarks = bookmarks_by_page.get(page_num, []) + for bm in page_bookmarks: + heading_prefix = "#" * bm["level"] + parts.append(f"\n{heading_prefix} {bm['title']}\n") # Extract text 
text = page.extract_text() diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py index abd43fed..5e52d514 100644 --- a/openviking_cli/utils/config/parser_config.py +++ b/openviking_cli/utils/config/parser_config.py @@ -37,6 +37,9 @@ class ParserConfig: max_section_size: int = 1000 # Maximum tokens per section before splitting section_size_flexibility: float = 0.3 # Allow 30% overflow to maintain coherence + # Directory structure configuration + max_children_per_dir: int = 50 # Maximum files per directory (0=unlimited) + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ParserConfig": """ From 17621251ffdb7b866ba792ed934a8e93d0ca3d4d Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 4 Mar 2026 18:00:04 +0800 Subject: [PATCH 2/3] refactor: remove auto-group directory logic and translate comments to English Remove max_children_per_dir config and auto-grouping of sections into subdirectories when file count exceeds a limit. Translate all Chinese comments and docstrings to English across pdf.py and parser_config.py. Also add PDF heading detection with a font-size-analysis fallback (_detect_headings_by_font) used when no bookmarks are found, configured via the new PDFConfig options heading_detection, font_heading_min_delta and max_heading_levels.
Co-Authored-By: Claude Opus 4.6 --- openviking/parse/parsers/markdown.py | 64 +---- openviking/parse/parsers/pdf.py | 254 ++++++++++++++----- openviking_cli/utils/config/parser_config.py | 14 +- 3 files changed, 208 insertions(+), 124 deletions(-) diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index 110f1477..10393311 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -52,7 +52,6 @@ class MarkdownParser(BaseParser): DEFAULT_MAX_SECTION_SIZE = 1024 # Maximum tokens per section DEFAULT_MIN_SECTION_TOKENS = 512 # Minimum tokens to create a separate section MAX_MERGED_FILENAME_LENGTH = 32 # Maximum length for merged section filenames - MAX_CHILDREN_PER_DIR = 50 # Maximum files per directory def __init__( self, @@ -400,24 +399,9 @@ async def _parse_and_create_structure( if not headings: logger.info("[MarkdownParser] No headings, splitting by paragraphs") parts = self._smart_split_content(content, max_size) - max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR - groups = self._auto_group_sections(parts, doc_name, max_children) - - for group_idx, (subdir_name, group_parts) in enumerate(groups): - if subdir_name: - subdir_path = f"{root_dir}/{subdir_name}" - await viking_fs.mkdir(subdir_path) - base_path = subdir_path - else: - base_path = root_dir - - offset = group_idx * max_children - for part_idx, part in enumerate(group_parts, offset + 1): - await viking_fs.write_file(f"{base_path}/{doc_name}_{part_idx}.md", part) - - logger.debug( - f"[MarkdownParser] Split into {len(parts)} parts across {len(groups)} groups" - ) + for part_idx, part in enumerate(parts, 1): + await viking_fs.write_file(f"{root_dir}/{doc_name}_{part_idx}.md", part) + logger.debug(f"[MarkdownParser] Split into {len(parts)} parts") return # Build virtual section list (pre-heading content as first virtual section) @@ -473,22 +457,6 @@ async def _process_sections_with_merge( for section in 
sections ] - # Auto-group when too many sibling sections into subdirectories - max_children = self.config.max_children_per_dir or self.MAX_CHILDREN_PER_DIR - if len(expanded) > max_children: - viking_fs = self._get_viking_fs() - for i in range(0, len(expanded), max_children): - chunk = expanded[i : i + max_children] - first_name = chunk[0]["name"] - last_name = chunk[-1]["name"] - subdir_name = self._sanitize_for_path(f"{first_name}_to_{last_name}") - subdir_path = f"{parent_dir}/{subdir_name}" - await viking_fs.mkdir(subdir_path, exist_ok=True) - await self._process_expanded_sections( - content, headings, subdir_path, chunk, parent_name, max_size, min_size - ) - return - await self._process_expanded_sections( content, headings, parent_dir, expanded, parent_name, max_size, min_size ) @@ -735,29 +703,3 @@ def _estimate_token_count(self, content: str) -> int: cjk_chars = len(re.findall(r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]", content)) other_chars = len(re.findall(r"[^\s]", content)) - cjk_chars return int(cjk_chars * 0.7 + other_chars * 0.3) - - def _auto_group_sections( - self, parts: List[str], doc_name: str, group_size: int - ) -> List[Tuple[str, List[str]]]: - """Group flat parts into subdirectories. - - Args: - parts: List of content parts - doc_name: Document name - group_size: Maximum number of files per group - - Returns: - List of (subdir_name, parts) tuples. - Empty subdir_name means no subdirectory needed. 
- """ - if len(parts) <= group_size: - return [("", parts)] - - groups = [] - for i in range(0, len(parts), group_size): - chunk = parts[i : i + group_size] - start = i + 1 - end = i + len(chunk) - subdir = f"{doc_name}_{start:03d}-{end:03d}" - groups.append((subdir, chunk)) - return groups diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py index 592fbe12..9fb19743 100644 --- a/openviking/parse/parsers/pdf.py +++ b/openviking/parse/parsers/pdf.py @@ -13,8 +13,9 @@ """ import logging +import re import time -from collections import defaultdict +from collections import Counter, defaultdict from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -189,57 +190,6 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any else: raise ValueError(f"Unknown strategy: {self.config.strategy}") - def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]: - """ - Returns: [{level: int, title: str, page_num: int(1-based)}] - """ - try: - if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"): - return [] - - outlines = pdf.doc.get_outlines() - if not outlines: - return [] - - # Build a mapping from pdfminer page objects to page numbers - objid_to_pagenum = { - page.page_obj.objid: i + 1 - for i, page in enumerate(pdf.pages) - if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid") - } - - bookmarks = [] - for level, title, dest, _action, _se in outlines: - if not title or not title.strip(): - continue - - page_num = None - try: - if dest and len(dest) > 0: - page_ref = dest[0] - if hasattr(page_ref, "objid"): - page_num = objid_to_pagenum.get(page_ref.objid) - elif hasattr(page_ref, "resolve"): - resolved = page_ref.resolve() - if hasattr(resolved, "objid"): - page_num = objid_to_pagenum.get(resolved.objid) - except Exception: - pass - - bookmarks.append( - { - "level": min(max(level, 1), 6), - "title": title.strip(), - "page_num": page_num, - } - ) - - return bookmarks - - except 
Exception as e: - logger.warning(f"Failed to extract bookmarks: {e}") - return [] - async def _convert_local( self, pdf_path: Path, storage=None, resource_name: Optional[str] = None ) -> tuple[str, Dict[str, Any]]: @@ -281,26 +231,41 @@ async def _convert_local( "pages_processed": 0, "images_extracted": 0, "tables_extracted": 0, - "bookmarks_extracted": 0, + "bookmarks_found": 0, + "heading_source": "none", } try: with pdfplumber.open(str(pdf_path)) as pdf: meta["total_pages"] = len(pdf.pages) - # Extract bookmarks and group by page number - bookmarks = self._extract_bookmarks(pdf) - meta["bookmarks_extracted"] = len(bookmarks) - logger.info(f"Extracted {len(bookmarks)} bookmarks") + # Extract structure (bookmarks → font fallback) + detection_mode = self.config.heading_detection + bookmarks = [] + heading_source = "none" + + if detection_mode in ("bookmarks", "auto"): + bookmarks = self._extract_bookmarks(pdf) + if bookmarks: + heading_source = "bookmarks" + + if not bookmarks and detection_mode in ("font", "auto"): + bookmarks = self._detect_headings_by_font(pdf) + if bookmarks: + heading_source = "font_analysis" - # Build a lookup: page_num -> list of bookmarks + meta["bookmarks_found"] = len(bookmarks) + meta["heading_source"] = heading_source + logger.info(f"Heading detection: {heading_source}, found {len(bookmarks)} headings") + + # Group bookmarks by page_num bookmarks_by_page = defaultdict(list) for bm in bookmarks: if bm["page_num"]: bookmarks_by_page[bm["page_num"]].append(bm) for page_num, page in enumerate(pdf.pages, 1): - # Inject bookmark headings for this page + # Inject headings before page text page_bookmarks = bookmarks_by_page.get(page_num, []) for bm in page_bookmarks: heading_prefix = "#" * bm["level"] @@ -361,7 +326,7 @@ async def _convert_local( markdown_content = "\n\n".join(parts) logger.info( f"Local conversion: {meta['pages_processed']}/{meta['total_pages']} pages, " - f"{meta['bookmarks_extracted']} bookmarks, " + 
f"{meta['bookmarks_found']} bookmarks ({meta['heading_source']}), " f"{meta['images_extracted']} images, {meta['tables_extracted']} tables → " f"{len(markdown_content)} chars" ) @@ -372,6 +337,175 @@ async def _convert_local( logger.error(f"pdfplumber conversion failed: {e}") raise + def _extract_bookmarks(self, pdf) -> List[Dict[str, Any]]: + """Extract bookmark structure from PDF outlines. + + Returns: [{level: int, title: str, page_num: int(1-based)}] + """ + try: + if not hasattr(pdf, "doc") or not hasattr(pdf.doc, "get_outlines"): + return [] + + outlines = pdf.doc.get_outlines() + if not outlines: + return [] + + # Build objid → page_number mapping + objid_to_num = { + page.page_obj.objid: i + 1 + for i, page in enumerate(pdf.pages) + if hasattr(page, "page_obj") and hasattr(page.page_obj, "objid") + } + + bookmarks = [] + for level, title, dest, _action, _se in outlines: + if not title or not title.strip(): + continue + + page_num = None + try: + if dest and len(dest) > 0: + page_ref = dest[0] + if hasattr(page_ref, "objid"): + page_num = objid_to_num.get(page_ref.objid) + elif hasattr(page_ref, "resolve"): + resolved = page_ref.resolve() + if hasattr(resolved, "objid"): + page_num = objid_to_num.get(resolved.objid) + except Exception: + pass + + bookmarks.append( + { + "level": min(max(level, 1), 6), + "title": title.strip(), + "page_num": page_num, + } + ) + + return bookmarks + + except Exception as e: + logger.warning(f"Failed to extract bookmarks: {e}") + return [] + + def _detect_headings_by_font(self, pdf) -> List[Dict[str, Any]]: + """Detect headings by font size analysis. 
+ + Returns: [{level: int, title: str, page_num: int(1-based)}] + """ + try: + # Step 1: Sample font size distribution (every 5th page) + size_counter: Counter = Counter() + sample_pages = pdf.pages[::5] + for page in sample_pages: + for char in page.chars: + if char["text"].strip(): + rounded = round(char["size"] * 2) / 2 + size_counter[rounded] += 1 + + if not size_counter: + return [] + + # Step 2: Determine body font size and heading font sizes + body_size = size_counter.most_common(1)[0][0] + min_delta = self.config.font_heading_min_delta + + heading_sizes = sorted( + [ + s + for s, count in size_counter.items() + if s >= body_size + min_delta and count < size_counter[body_size] * 0.3 + ], + reverse=True, + ) + + max_levels = self.config.max_heading_levels + heading_sizes = heading_sizes[:max_levels] + + if not heading_sizes: + logger.debug(f"Font analysis: body_size={body_size}pt, no heading sizes found") + return [] + + size_to_level = {s: i + 1 for i, s in enumerate(heading_sizes)} + logger.debug( + f"Font analysis: body_size={body_size}pt, " + f"heading_sizes={heading_sizes}, size_to_level={size_to_level}" + ) + + # Step 3: Extract heading text page by page + headings: List[Dict[str, Any]] = [] + + def flush_line(chars_to_flush: list, page_num: int) -> None: + if not chars_to_flush: + return + title = "".join(c["text"] for c in chars_to_flush).strip() + size = round(chars_to_flush[0]["size"] * 2) / 2 + + if len(title) < 2: + return + if len(title) > 100: + return + if title.isdigit(): + return + if re.match(r"^[\d\s.·…]+$", title): + return + + headings.append( + { + "level": size_to_level[size], + "title": title, + "page_num": page_num, + } + ) + + for page in pdf.pages: + page_num = page.page_number + 1 + chars = sorted(page.chars, key=lambda c: (c["top"], c["x0"])) + + current_line_chars: list = [] + current_top = None + + for char in chars: + # Performance: headings won't appear in bottom 70% of page + if char["top"] > page.height * 0.3: + 
flush_line(current_line_chars, page_num) + current_line_chars = [] + break + + rounded_size = round(char["size"] * 2) / 2 + if rounded_size not in size_to_level: + flush_line(current_line_chars, page_num) + current_line_chars = [] + current_top = None + continue + + # Same line check (top offset < 2pt) + if current_top is not None and abs(char["top"] - current_top) > 2: + flush_line(current_line_chars, page_num) + current_line_chars = [] + + current_line_chars.append(char) + current_top = char["top"] + + flush_line(current_line_chars, page_num) + + # Step 4: Deduplicate - filter headers appearing on >30% of pages + title_page_count: Counter = Counter(h["title"] for h in headings) + total_pages = len(pdf.pages) + header_titles = {t for t, c in title_page_count.items() if c > total_pages * 0.3} + headings = [h for h in headings if h["title"] not in header_titles] + + logger.debug( + f"Font heading detection: {len(headings)} headings found " + f"(filtered {len(header_titles)} header titles)" + ) + return headings + + except Exception as e: + logger.warning(f"Failed to detect headings by font: {e}") + return [] + def _extract_image_from_page(self, page, img_info: dict) -> Optional[bytes]: """ Extract image data from PDF page. 
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py index 5e52d514..f59b2fa3 100644 --- a/openviking_cli/utils/config/parser_config.py +++ b/openviking_cli/utils/config/parser_config.py @@ -37,9 +37,6 @@ class ParserConfig: max_section_size: int = 1000 # Maximum tokens per section before splitting section_size_flexibility: float = 0.3 # Allow 30% overflow to maintain coherence - # Directory structure configuration - max_children_per_dir: int = 50 # Maximum files per directory (0=unlimited) - @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ParserConfig": """ @@ -149,6 +146,11 @@ class PDFConfig(ParserConfig): mineru_timeout: float = 300.0 # Request timeout in seconds (5 minutes) mineru_params: Optional[dict] = None # Additional API parameters + # Heading detection configuration + heading_detection: str = "auto" # "bookmarks" | "font" | "auto" | "none" + font_heading_min_delta: float = 1.5 # Minimum font size delta from body text (pt) + max_heading_levels: int = 4 # Maximum heading levels for font analysis + def validate(self) -> None: """ Validate configuration. @@ -172,6 +174,12 @@ def validate(self) -> None: if self.mineru_timeout <= 0: raise ValueError("mineru_timeout must be positive") + if self.heading_detection not in ("bookmarks", "font", "auto", "none"): + raise ValueError(f"Invalid heading_detection: {self.heading_detection}") + + if self.font_heading_min_delta <= 0: + raise ValueError("font_heading_min_delta must be positive") + @dataclass class CodeHostingConfig(ParserConfig): From cbda51debc54e10f119f888f677360ce7ca2fa0d Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 4 Mar 2026 18:03:58 +0800 Subject: [PATCH 3/3] refactor: merge _process_expanded_sections back into _process_sections_with_merge The separate function was only needed for the removed auto-group logic. 
Co-Authored-By: Claude Opus 4.6 --- openviking/parse/parsers/markdown.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index 10393311..fb570e30 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -449,6 +449,8 @@ async def _process_sections_with_merge( min_size: int, ) -> None: """Process sections with small section merge logic.""" + viking_fs = self._get_viking_fs() + # Expand section info expanded = [ section @@ -457,23 +459,6 @@ async def _process_sections_with_merge( for section in sections ] - await self._process_expanded_sections( - content, headings, parent_dir, expanded, parent_name, max_size, min_size - ) - - async def _process_expanded_sections( - self, - content: str, - headings: List[Tuple[int, int, str, int]], - parent_dir: str, - expanded: List[Dict[str, Any]], - parent_name: str, - max_size: int, - min_size: int, - ) -> None: - """Process a list of already-expanded sections with merge logic.""" - viking_fs = self._get_viking_fs() - pending = [] for sec in expanded: name, tokens, content_text = sec["name"], sec["tokens"], sec["content"]