diff --git a/README.md b/README.md index cc19188f..c1ad36c5 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,7 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`: model: gpt-5.4 # LLM model (any LiteLLM-supported provider) language: en # Wiki output language pageindex_threshold: 20 # PDF pages threshold for PageIndex +parser: local # Document parser: local | mineru | mistral | vlm ``` Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix): @@ -276,6 +277,50 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p | Anthropic | `anthropic/claude-sonnet-4-6` | | Gemini | `gemini/gemini-3.1-pro-preview` | +### Document parsers + +By default OpenKB extracts Markdown locally (pymupdf for PDFs, markitdown for +Office/HTML) — no extra dependencies, unchanged behavior. For higher accuracy on +complex documents you can route the file → Markdown step through an online or +self-hosted parser: + +```yaml +# .openkb/config.yaml +parser: mineru # local (default) | mineru | mistral | vlm +parsers: + mineru: + mode: cloud # cloud | self_hosted + base_url: http://localhost:8000 # required when mode is self_hosted + vlm: + model: gemini/gemini-2.5-pro # any LiteLLM vision model (Gemini, GPT-4o, Claude, …) +``` + +Install the optional dependency for your parser: + +```bash +pip install openkb[mistral] # Mistral OCR +pip install openkb[mineru] # MinerU (HTTP) +pip install openkb[parsers] # all online parsers +# vlm uses the existing LiteLLM dependency — no extra needed +``` + +Set the API key via environment variable: `MINERU_API_KEY` (MinerU cloud mode), +`MISTRAL_API_KEY`; the `vlm` parser reuses the existing `LLM_API_KEY`. Override +the parser for a single run with `openkb add --parser mistral file.pdf` +(`local | mineru | mistral | vlm`). 
+ +Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Excel, +and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always +fall back to the local parser. + +The `vlm` parser is **text-only**: it transcribes a document's text via a vision +LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or +`local` if you need image extraction. + +> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be +> indexed with PageIndex and are **not** affected by the `parser` setting. The +> parser governs the file → Markdown step for shorter documents and non-PDF files. + ### PageIndex Integration Long documents are challenging for LLMs due to context limits, context rot, and summarization loss. diff --git a/openkb/cli.py b/openkb/cli.py index 1a2761d8..b19da6a8 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -43,6 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool: from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb from openkb.converter import convert_document from openkb.log import append_log +from openkb.parsers.registry import VALID_PARSERS from openkb.schema import AGENTS_MD # Suppress warnings after all imports — markitdown overrides filters at import time @@ -124,17 +125,19 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None: else: litellm.api_key = api_key - # Dynamically set the provider-specific env var when possible if provider: + # Active provider is known — set only its key, so LLM_API_KEY is not + # sprayed into unrelated provider keys (e.g. MISTRAL_API_KEY, which the + # Mistral parser treats as a real Mistral credential). provider_env = f"{provider.upper()}_API_KEY" if not os.environ.get(provider_env): os.environ[provider_env] = api_key - - # Fallback: also set common provider keys so multi-provider - # configs (e.g. 
PageIndex Cloud) still work - for env_var in _KNOWN_PROVIDER_KEYS: - if not os.environ.get(env_var): - os.environ[env_var] = api_key + else: + # Provider couldn't be determined — fall back to setting the common + # provider keys so multi-provider configs still work. + for env_var in _KNOWN_PROVIDER_KEYS: + if not os.environ.get(env_var): + os.environ[env_var] = api_key # Supported document extensions for the `add` command SUPPORTED_EXTENSIONS = { @@ -259,7 +262,7 @@ def _clear_existing_skill_dir(kb_dir: Path, name: str) -> None: shutil.rmtree(target) -def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]: +def add_single_file(file_path: Path, kb_dir: Path, parser_override: str | None = None) -> Literal["added", "skipped", "failed"]: """Convert, index, and compile a single document into the knowledge base. Steps: @@ -289,7 +292,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped" # 2. Convert document click.echo(f"Adding: {file_path.name}") try: - result = convert_document(file_path, kb_dir) + result = convert_document(file_path, kb_dir, parser_override=parser_override) except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") logger.debug("Conversion traceback:", exc_info=True) @@ -575,8 +578,11 @@ def init(model, language): @cli.command() @click.argument("path") +@click.option("--parser", "parser_override", default=None, + type=click.Choice(VALID_PARSERS), + help="Override the configured parser for this run.") @click.pass_context -def add(ctx, path): +def add(ctx, path, parser_override): """Add a document or directory of documents at PATH to the knowledge base. PATH may be a local file, a local directory (which is walked @@ -600,7 +606,7 @@ def add(ctx, path): fetched = fetch_url_to_raw(path, kb_dir) if fetched is None: return - outcome = add_single_file(fetched, kb_dir) + outcome = add_single_file(fetched, kb_dir, parser_override=parser_override) # Only clean up on dedup-skip. 
On "failed" we keep the file so # the user can retry (e.g. transient LLM error during compile) # without re-downloading — and so they don't lose data when @@ -626,7 +632,7 @@ def add(ctx, path): click.echo(f"Found {total} supported file(s) in {path}.") for i, f in enumerate(files, 1): click.echo(f"\n[{i}/{total}] ", nl=False) - add_single_file(f, kb_dir) + add_single_file(f, kb_dir, parser_override=parser_override) else: if target.suffix.lower() not in SUPPORTED_EXTENSIONS: click.echo( @@ -634,7 +640,7 @@ def add(ctx, path): f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}" ) return - add_single_file(target, kb_dir) + add_single_file(target, kb_dir, parser_override=parser_override) def _stream_to_tty() -> bool: diff --git a/openkb/config.py b/openkb/config.py index b83e1346..dea9d482 100644 --- a/openkb/config.py +++ b/openkb/config.py @@ -9,6 +9,7 @@ "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, + "parser": "local", } GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb" diff --git a/openkb/converter.py b/openkb/converter.py index 352c22b3..2ac6abb1 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -7,10 +7,11 @@ from pathlib import Path import pymupdf -from markitdown import MarkItDown from openkb.config import load_config -from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images +from openkb.images import localize_images +from openkb.parsers import get_parser +from openkb.parsers.local import LocalParser from openkb.state import HashRegistry logger = logging.getLogger(__name__) @@ -33,16 +34,17 @@ def get_pdf_page_count(path: Path) -> int: return doc.page_count -def convert_document(src: Path, kb_dir: Path) -> ConvertResult: +def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None) -> ConvertResult: """Convert a document and integrate it into the knowledge base. Steps: 1. Hash-check — skip if already known. 2. Copy source to ``raw/``. 3. 
If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`. - 4. If ``.md`` — read, process relative images, save to ``wiki/sources/``. - 5. Otherwise — run MarkItDown, extract base64 images, save to ``wiki/sources/``. - 6. Register hash in the registry. + 4. Select a parser via :func:`get_parser` (falling back to + :class:`LocalParser` for unsupported suffixes like ``.md``), parse the + file to Markdown, localize images, and save to ``wiki/sources/``. + 5. Register hash in the registry. """ # ------------------------------------------------------------------ # Load config & state @@ -84,7 +86,7 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash) # ------------------------------------------------------------------ - # 4/5. Convert to Markdown + # 4. Select parser, convert to Markdown, localize images # ------------------------------------------------------------------ sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) @@ -93,18 +95,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: doc_name = src.stem - if src.suffix.lower() == ".md": - markdown = src.read_text(encoding="utf-8") - markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir) - elif src.suffix.lower() == ".pdf": - # Use pymupdf dict-mode for PDFs: text + images inline at correct positions - markdown = convert_pdf_with_images(src, doc_name, images_dir) + parser = get_parser( + config, + override=parser_override, + doc_name=doc_name, + images_dir=images_dir, + source_dir=src.parent, + ) + if not parser.supports(src.suffix): + if parser.name != "local": + logger.warning( + "Parser %r does not support %r; falling back to the local parser for %s.", + parser.name, src.suffix, src.name, + ) + parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent) + + parse_result = parser.parse(src) + if 
def localize_images(
    markdown: str,
    images: dict[str, bytes],
    doc_name: str,
    images_dir: Path,
) -> str:
    """Persist parser-supplied images and normalize image links.

    1. Write every ``images`` entry to ``images_dir`` under its basename
       (``Path(filename).name``), so a name with ``/`` directory components or
       an absolute path can never write outside ``images_dir``. A basename
       that is empty, ``.`` or ``..`` is not a usable file name and is stored
       as ``image`` instead.
    2. Rewrite markdown image links whose target's basename matches a written
       image to the canonical ``sources/images/{doc_name}/{basename}`` path —
       this handles bare names, directory-prefixed targets (e.g.
       ``images/fig.png``), and links carrying a title attribute. Targets that
       start with ``http://``, ``https://`` or ``data:`` are never rewritten,
       even if their last path segment equals a written image's name.
    3. Localize any inline base64 images via :func:`extract_base64_images`.

    Returns the normalized markdown.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    safe_names: set[str] = set()
    for filename, data in images.items():
        safe = Path(filename).name
        # ``Path("..").name`` is ".." (not ""), which would make write_bytes
        # target a directory and abort the whole conversion.
        if safe in ("", ".", ".."):
            safe = "image"
        (images_dir / safe).write_bytes(data)
        safe_names.add(safe)

    def _rewrite(m: "re.Match[str]") -> str:
        pre, target, title, close = m.group(1), m.group(2), m.group(3), m.group(4)
        # A remote or inline image is not one of the files written above, even
        # when its final path segment happens to share a name with one.
        if target.lower().startswith(("http://", "https://", "data:")):
            return m.group(0)
        base = Path(target).name
        if base in safe_names:
            return f"{pre}sources/images/{doc_name}/{base}{title}{close}"
        return m.group(0)

    result = _IMG_LINK_RE.sub(_rewrite, markdown)
    result = extract_base64_images(result, doc_name, images_dir)
    return result
class LocalParser(Parser):
    """Default parser: pymupdf for PDF, markitdown for office/html, direct read for md.

    Unlike the online parsers, this parser persists images itself while
    parsing, so the returned :class:`ParseResult` carries only ``markdown``.
    """

    name = "local"

    def __init__(self, doc_name: str = "", images_dir: Path | None = None,
                 source_dir: Path | None = None):
        # doc_name / images_dir are forwarded to the image helpers.
        self.doc_name = doc_name
        self.images_dir = images_dir
        # Directory that relative image links in a Markdown source are
        # resolved against; ``None`` means "the parsed file's own directory".
        self.source_dir = source_dir

    def supports(self, suffix: str) -> bool:
        """Return True if ``suffix`` (case-insensitive) is in ``_LOCAL_EXTENSIONS``."""
        return suffix.lower() in _LOCAL_EXTENSIONS

    def parse(self, src: Path) -> ParseResult:
        """Convert ``src`` to Markdown, dispatching on its suffix.

        * ``.md`` / ``.markdown`` — read as UTF-8 and pass through
          :func:`copy_relative_images`, resolving relative links against
          ``source_dir`` (or ``src.parent`` when it was not given).
        * ``.pdf`` — :func:`convert_pdf_with_images`.
        * anything else — MarkItDown, then :func:`extract_base64_images`.
        """
        suffix = src.suffix.lower()
        if suffix in {".md", ".markdown"}:
            # Honor the configured source_dir; previously it was stored but
            # ignored in favor of a hard-coded src.parent.
            base_dir = self.source_dir if self.source_dir is not None else src.parent
            markdown = src.read_text(encoding="utf-8")
            markdown = copy_relative_images(
                markdown, base_dir, self.doc_name, self.images_dir
            )
        elif suffix == ".pdf":
            markdown = convert_pdf_with_images(src, self.doc_name, self.images_dir)
        else:
            mid = MarkItDown()
            markdown = mid.convert(str(src)).text_content
            markdown = extract_base64_images(markdown, self.doc_name, self.images_dir)
        return ParseResult(markdown=markdown)
def _result_from_zip(zip_bytes: bytes) -> ParseResult:
    """Extract the markdown file + images from a MinerU result zip.

    The markdown entry is ``full.md`` when present, otherwise the
    alphabetically first ``.md`` entry. Every ``.png/.jpg/.jpeg/.gif/.webp``
    entry is returned keyed by its basename; when two entries share a
    basename the later one wins and a warning is logged.

    Raises:
        RuntimeError: if the archive contains no ``.md`` entry. Returning an
            empty document here would let the caller write and register an
            empty source page as if conversion had succeeded.
    """
    images: dict[str, bytes] = {}
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        names = zf.namelist()
        md_names = sorted(n for n in names if n.lower().endswith(".md"))
        if not md_names:
            raise RuntimeError(
                "MinerU result archive contains no Markdown (.md) file."
            )
        chosen = next((n for n in md_names if Path(n).name == "full.md"), md_names[0])
        # errors="replace": a stray undecodable byte should not discard the document.
        markdown = zf.read(chosen).decode("utf-8", errors="replace")
        for name in names:
            if not name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")):
                continue
            # Key by basename so the caller can match links regardless of the
            # folder layout inside the archive.
            base = Path(name).name
            if base in images:
                logger.warning(
                    "MinerU result has multiple images named %r in different "
                    "folders; keeping the last. Earlier one may be lost.", base
                )
            images[base] = zf.read(name)
    return ParseResult(markdown=markdown, images=images)
class MistralParser(Parser):
    """Mistral OCR (Document AI). Synchronous; markdown + base64 images."""

    name = "mistral"

    def __init__(self, opts: dict[str, Any] | None = None):
        self.opts = opts if opts else {}
        self.model = self.opts.get("model", "mistral-ocr-latest")

    def supports(self, suffix: str) -> bool:
        """Return True when the lower-cased ``suffix`` is in ``_SUPPORTED``."""
        normalized = suffix.lower()
        return normalized in _SUPPORTED

    def parse(self, src: Path) -> ParseResult:
        """Upload ``src`` to Mistral, OCR it, and collect page markdown + images.

        Page markdown is joined with blank lines; images are decoded from
        base64 and keyed by their Mistral image id. The uploaded file is
        deleted afterwards on a best-effort basis.

        Raises:
            RuntimeError: if ``MISTRAL_API_KEY`` is unset or the ``mistralai``
                package is not installed.
        """
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise RuntimeError(
                "Mistral parser requires the MISTRAL_API_KEY environment variable."
            )
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Mistral parser requires the 'mistralai' package. "
                "Install with: pip install openkb[mistral]"
            ) from exc

        client = Mistral(api_key=api_key)
        uploaded = None
        try:
            payload = {"file_name": src.name, "content": src.read_bytes()}
            uploaded = client.files.upload(file=payload, purpose="ocr")
            signed = client.files.get_signed_url(file_id=uploaded.id)
            document = {"type": "document_url", "document_url": signed.url}
            resp = client.ocr.process(
                model=self.model,
                document=document,
                include_image_base64=True,
            )

            page_markdown: list[str] = []
            decoded: dict[str, bytes] = {}
            for page in resp.pages:
                page_markdown.append(page.markdown if page.markdown else "")
                page_images = getattr(page, "images", None)
                if not page_images:
                    continue
                for img in page_images:
                    # Strip an optional "data:<mime>;base64," prefix before decoding.
                    encoded = _DATA_URI_RE.sub("", img.image_base64 or "")
                    try:
                        decoded[img.id] = base64.b64decode(encoded, validate=True)
                    except Exception:
                        logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
            return ParseResult(markdown="\n\n".join(page_markdown), images=decoded)
        finally:
            # Best-effort cleanup: a failed delete must not mask the parse outcome.
            if uploaded is not None:
                try:
                    client.files.delete(file_id=uploaded.id)
                except Exception:
                    logger.warning(
                        "Failed to delete uploaded Mistral OCR file %s",
                        getattr(uploaded, "id", "?"),
                    )
def get_parser(
    config: dict[str, Any],
    override: str | None = None,
    *,
    doc_name: str = "",
    images_dir: Path | None = None,
    source_dir: Path | None = None,
) -> Parser:
    """Resolve the configured parser. ``override`` (e.g. CLI ``--parser``) wins.

    The name comes from ``override``, then ``config["parser"]``, then
    ``"local"``; it is compared case-insensitively with surrounding whitespace
    ignored. ``doc_name``, ``images_dir`` and ``source_dir`` are only used by
    the local parser. Online parsers receive their options from
    ``config["parsers"][name]`` (an empty dict when absent) plus the full
    ``config``.

    Raises:
        ValueError: if the name is not one of ``VALID_PARSERS``, or if
            ``parsers`` / ``parsers.<name>`` is present but not a mapping.
    """
    # str() so a non-string YAML scalar (e.g. ``parser: 1``) ends in the
    # "Unknown parser" error below instead of an AttributeError on .lower().
    name = str(override or config.get("parser") or "local").strip().lower()
    if name == "local":
        return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir)
    factory = _ONLINE_PARSERS.get(name)
    if factory is None:
        raise ValueError(
            f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
        )
    parsers_cfg = config.get("parsers") or {}
    if not isinstance(parsers_cfg, dict):
        raise ValueError(
            "Config key 'parsers' must be a mapping of parser name to options."
        )
    opts = parsers_cfg.get(name) or {}
    if not isinstance(opts, dict):
        # Catch e.g. ``parsers: {mineru: cloud}`` here rather than deep inside
        # the parser constructor.
        raise ValueError(f"Config key 'parsers.{name}' must be a mapping of options.")
    return factory(opts, config)
def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | None = None) -> str:
    """Send ``src`` (PDF or image) to a vision-capable LLM via litellm; return Markdown.

    ``model`` falls back to ``_DEFAULT_MODEL`` and ``prompt`` to ``_PROMPT``.
    The file is inlined as a base64 data URI whose MIME type is guessed from
    the file name. An empty string is returned when the response has no
    message content.
    """
    chosen_model = model or _DEFAULT_MODEL
    guessed_mime, _encoding = mimetypes.guess_type(src.name)
    mime = guessed_mime or "application/octet-stream"
    encoded = base64.b64encode(src.read_bytes()).decode()
    data_uri = f"data:{mime};base64,{encoded}"

    # PDFs go in litellm's document/file content part; image_url is only for
    # raster images.
    media_part = (
        {"type": "file", "file": {"file_data": data_uri}}
        if mime == "application/pdf"
        else {"type": "image_url", "image_url": {"url": data_uri}}
    )
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt or _PROMPT}, media_part],
    }
    resp = litellm.completion(model=chosen_model, messages=[user_message])
    return resp.choices[0].message.content or ""
add_single_file(Path("x.pdf"), tmp_path, parser_override="mistral") + # parser_override must reach convert_document + assert cd.call_args.kwargs.get("parser_override") == "mistral" \ + or (len(cd.call_args.args) >= 3 and cd.call_args.args[2] == "mistral") + + +def test_add_parser_option_rejects_invalid_choice(tmp_path): + from click.testing import CliRunner + from openkb.cli import cli + runner = CliRunner() + result = runner.invoke(cli, ["add", "--parser", "bogus", str(tmp_path / "x.pdf")]) + assert result.exit_code != 0 + assert "bogus" in result.output or "Invalid value" in result.output diff --git a/tests/test_cli.py b/tests/test_cli.py index ab3378b1..e80e272f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -365,3 +365,17 @@ async def fake_run_query(*_args, **_kwargs): assert "rnn" in saved assert "[[concepts/multi-head-attention]]" not in saved assert "multi head attention" in saved + + +def test_setup_llm_key_does_not_spray_unrelated_provider_keys(tmp_path, monkeypatch): + import os + from openkb.cli import _setup_llm_key + # KB with an openai model (known provider) + openkb_dir = tmp_path / ".openkb"; openkb_dir.mkdir() + (openkb_dir / "config.yaml").write_text("model: openai/gpt-4o\n") + for k in ("MISTRAL_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"): + monkeypatch.delenv(k, raising=False) + monkeypatch.setenv("LLM_API_KEY", "sk-test") + _setup_llm_key(tmp_path) + assert os.environ.get("OPENAI_API_KEY") == "sk-test" # active provider set + assert os.environ.get("MISTRAL_API_KEY") is None # unrelated provider NOT sprayed diff --git a/tests/test_config.py b/tests/test_config.py index 35704a6b..0d9aae36 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -45,3 +45,8 @@ def test_load_overrides_defaults(tmp_path): assert loaded["pageindex_threshold"] == 100 # Non-overridden defaults still present assert loaded["language"] == "en" + + +def test_default_parser_is_local(): + from openkb.config import DEFAULT_CONFIG + assert 
DEFAULT_CONFIG["parser"] == "local" diff --git a/tests/test_converter.py b/tests/test_converter.py index d7475b09..6b5f2e41 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -85,7 +85,7 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): with ( patch("openkb.converter.pymupdf.open") as mock_mu, - patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, + patch("openkb.parsers.local.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, ): fake_doc = MagicMock() fake_doc.page_count = 5 # below default threshold of 20 @@ -128,3 +128,67 @@ def test_long_pdf_returns_is_long_doc(self, kb_dir, tmp_path): assert result.source_path is None assert result.skipped is False assert result.raw_path is not None + + +from openkb.parsers.base import ParseResult + + +class TestConvertDocumentParserSelection: + def test_uses_get_parser_and_localizes(self, kb_dir): + src = kb_dir / "raw" / "paper.pdf" + src.write_bytes(b"%PDF-1.4 fake") + + fake = MagicMock() + fake.supports.return_value = True + fake.parse.return_value = ParseResult(markdown="HELLO", images={"a.png": b"X"}) + + with patch("openkb.converter.get_pdf_page_count", return_value=1), \ + patch("openkb.converter.get_parser", return_value=fake) as gp, \ + patch("openkb.converter.localize_images", return_value="HELLO-LOCALIZED") as li: + result = convert_document(src, kb_dir) + + gp.assert_called_once() + assert gp.call_args.kwargs["doc_name"] == "paper" + assert gp.call_args.kwargs["images_dir"] is not None + li.assert_called_once() + assert result.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED" + + def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir): + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + + online = MagicMock() + online.supports.return_value = False # online parser can't do .md + with patch("openkb.converter.get_parser", return_value=online), \ + 
patch("openkb.converter.LocalParser") as LP: + LP.return_value.parse.return_value = ParseResult(markdown="# md") + convert_document(src, kb_dir) + LP.assert_called_once() # fell back to LocalParser + + def test_local_parser_skips_redundant_localize(self, kb_dir): + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + local = MagicMock() + local.name = "local" + local.supports.return_value = True + local.parse.return_value = ParseResult(markdown="# md final") + with patch("openkb.converter.get_parser", return_value=local), \ + patch("openkb.converter.localize_images") as li: + result = convert_document(src, kb_dir) + li.assert_not_called() # local path skips localize_images + assert result.source_path.read_text(encoding="utf-8") == "# md final" + + def test_warns_on_silent_downgrade(self, kb_dir, caplog): + import logging as _logging + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + online = MagicMock() + online.name = "mistral" + online.supports.return_value = False + with patch("openkb.converter.get_parser", return_value=online), \ + patch("openkb.converter.LocalParser") as LP: + LP.return_value.name = "local" + LP.return_value.parse.return_value = ParseResult(markdown="# md") + with caplog.at_level(_logging.WARNING): + convert_document(src, kb_dir) + assert any("falling back to the local parser" in r.message for r in caplog.records) diff --git a/tests/test_images.py b/tests/test_images.py index 9abb3ec2..26b8ed1b 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -4,7 +4,7 @@ import base64 -from openkb.images import copy_relative_images, extract_base64_images +from openkb.images import copy_relative_images, extract_base64_images, localize_images # --------------------------------------------------------------------------- @@ -164,3 +164,83 @@ def test_multiple_relative_images_all_copied(self, tmp_path): assert "![b](sources/images/doc/b.jpg)" in result assert (images_dir / "a.png").exists() 
assert (images_dir / "b.jpg").exists() + + +# --------------------------------------------------------------------------- +# localize_images +# --------------------------------------------------------------------------- + + +def test_localize_images_writes_bytes_and_rewrites_bare_refs(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "Before\n\n![fig](p1_img1.png)\n\nAfter" + out = localize_images(md, {"p1_img1.png": b"PNGDATA"}, "doc", images_dir) + assert "![fig](sources/images/doc/p1_img1.png)" in out + assert (images_dir / "p1_img1.png").read_bytes() == b"PNGDATA" + + +def test_localize_images_handles_inline_base64(tmp_path): + import base64 + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + payload = base64.b64encode(b"JPEGDATA").decode() + md = f"![x](data:image/jpeg;base64,{payload})" + out = localize_images(md, {}, "doc", images_dir) + assert "sources/images/doc/img_001.jpeg" in out + assert (images_dir / "img_001.jpeg").read_bytes() == b"JPEGDATA" + + +def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("no images here", {"orphan.png": b"X"}, "doc", images_dir) + assert out == "no images here" + assert (images_dir / "orphan.png").read_bytes() == b"X" + + +def test_localize_images_filename_with_regex_metachars(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + weird = r"img\g<9>.png" # backslash-escape-like name must not crash re.sub + md = f"![f]({weird})" + out = localize_images(md, {weird: b"DATA"}, "doc", images_dir) + assert f"sources/images/doc/{weird}" in out + assert (images_dir / weird).read_bytes() == b"DATA" + + +def test_localize_images_strips_path_traversal_in_filename(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "![bad](../../evil.png)" + out = localize_images(md, {"../../evil.png": b"DATA"}, "doc", images_dir) + # bytes written 
INSIDE images_dir under the basename only — no escape + assert (images_dir / "evil.png").read_bytes() == b"DATA" + assert not (tmp_path / "evil.png").exists() + assert not (images_dir.parent.parent / "evil.png").exists() + # the original ref is rewritten to the sanitized canonical path + assert "sources/images/doc/evil.png" in out + + +def test_localize_images_absolute_filename_stays_inside(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir) + assert (images_dir / "x.png").read_bytes() == b"D" + assert "sources/images/doc/x.png" in out + + +def test_localize_images_rewrites_directory_prefixed_target(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "![p](images/fig.png)\n\n![q](./sub/images/other.png)" + out = localize_images(md, {"fig.png": b"A", "other.png": b"B"}, "doc", images_dir) + assert "![p](sources/images/doc/fig.png)" in out + assert "![q](sources/images/doc/other.png)" in out + assert (images_dir / "fig.png").read_bytes() == b"A" + assert (images_dir / "other.png").read_bytes() == b"B" + + +def test_localize_images_preserves_title_attribute(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images('![a](fig.png "Figure 1")', {"fig.png": b"X"}, "doc", images_dir) + assert '![a](sources/images/doc/fig.png "Figure 1")' in out + + +def test_localize_images_inner_whitespace(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("![a]( fig.png )", {"fig.png": b"X"}, "doc", images_dir) + assert "sources/images/doc/fig.png" in out diff --git a/tests/test_parsers_base.py b/tests/test_parsers_base.py new file mode 100644 index 00000000..1c119a32 --- /dev/null +++ b/tests/test_parsers_base.py @@ -0,0 +1,24 @@ +"""Tests for the parser abstraction base types.""" +from __future__ import annotations + +import pytest + +from openkb.parsers.base 
import ParseResult, Parser + + +def test_parse_result_defaults_to_empty_images(): + pr = ParseResult(markdown="# Hi") + assert pr.markdown == "# Hi" + assert pr.images == {} + + +def test_parser_is_abstract(): + with pytest.raises(TypeError): + Parser() # cannot instantiate abstract base + + +def test_concrete_parser_must_implement_parse_and_supports(): + class Incomplete(Parser): + name = "incomplete" + with pytest.raises(TypeError): + Incomplete() diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py new file mode 100644 index 00000000..af17ed38 --- /dev/null +++ b/tests/test_parsers_local.py @@ -0,0 +1,48 @@ +"""Tests for LocalParser — preserves legacy md/pdf/markitdown behavior.""" +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +from openkb.parsers.local import LocalParser +from openkb.parsers.base import ParseResult + + +def test_supports_all_known_extensions(): + p = LocalParser() + for ext in [".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".csv"]: + assert p.supports(ext) is True + + +def test_parse_md_reads_text(tmp_path): + src = tmp_path / "n.md" + src.write_text("# Title\n\nbody", encoding="utf-8") + images_dir = tmp_path / "img" / "n" + p = LocalParser(doc_name="n", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + assert isinstance(result, ParseResult) + assert result.markdown.startswith("# Title") + + +def test_parse_pdf_delegates_to_convert_pdf_with_images(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 fake") + images_dir = tmp_path / "img" / "doc" + with patch("openkb.parsers.local.convert_pdf_with_images", return_value="PDF MD") as m: + p = LocalParser(doc_name="doc", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + m.assert_called_once_with(src, "doc", images_dir) + assert result.markdown == "PDF MD" + + +def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path): + src = tmp_path / 
"deck.pptx" + src.write_bytes(b"PK fake") + images_dir = tmp_path / "img" / "deck" + with patch("openkb.parsers.local.MarkItDown") as fake_mid, \ + patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex: + fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD" + p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir) + assert result.markdown == "CLEANED" diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py new file mode 100644 index 00000000..f81c8033 --- /dev/null +++ b/tests/test_parsers_mineru.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +import io +import sys +import types +import zipfile +from unittest.mock import MagicMock + +import pytest + +from openkb.parsers.base import ParseResult + + +def test_supports_office_and_pdf(): + from openkb.parsers.mineru import MineruParser + p = MineruParser({}) + assert p.supports(".pdf") is True + assert p.supports(".docx") is True + assert p.supports(".md") is False + + +def test_self_hosted_requires_base_url(tmp_path): + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "self_hosted"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "base_url" in str(exc.value) + + +def test_cloud_requires_api_key(monkeypatch, tmp_path): + monkeypatch.delenv("MINERU_API_KEY", raising=False) + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "cloud"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "MINERU_API_KEY" in str(exc.value) + + +def test_self_hosted_parses_zip(monkeypatch, tmp_path): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Mineru\n\n![p](images/fig.png)") + zf.writestr("images/fig.png", b"PNGBYTES") + 
zip_bytes = buf.getvalue() + + fake_resp = MagicMock(status_code=200, content=zip_bytes) + fake_resp.raise_for_status = MagicMock() + fake_client = MagicMock() + fake_client.__enter__ = MagicMock(return_value=fake_client) + fake_client.__exit__ = MagicMock(return_value=False) + fake_client.post.return_value = fake_resp + + httpx_mod = types.ModuleType("httpx") + httpx_mod.Client = MagicMock(return_value=fake_client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "self_hosted", "base_url": "http://localhost:8000"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + assert isinstance(result, ParseResult) + assert "Mineru" in result.markdown + assert result.images["fig.png"] == b"PNGBYTES" + # _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives + assert "images/fig.png" in result.markdown + # localize_images (which now rewrites by basename) canonicalizes it + from openkb.images import localize_images + md2 = localize_images(result.markdown, result.images, "d", tmp_path / "imgs") + assert "sources/images/d/fig.png" in md2 + + +def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path): + monkeypatch.setenv("MINERU_API_KEY", "key") + monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None) + + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Cloud") + zf.writestr("images/fig.png", b"ZBYTES") + zip_bytes = buf.getvalue() + + def _resp(json_data=None, content=None): + r = MagicMock() + r.raise_for_status = MagicMock() + if json_data is not None: + r.json.return_value = json_data + if content is not None: + r.content = content + return r + + client = MagicMock() + client.__enter__ = MagicMock(return_value=client) + client.__exit__ = MagicMock(return_value=False) + client.post.return_value = _resp( + json_data={"data": {"batch_id": "b1", "file_urls": ["https://upload"]}} + 
) + client.put.return_value = _resp() + + poll_url = "https://mineru.net/api/v4/extract-results/batch/b1" + poll_running = _resp(json_data={"data": {"extract_result": [{"state": "running"}]}}) + poll_done = _resp( + json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}} + ) + zip_resp = _resp(content=zip_bytes) + + def _get(url, *a, **k): + if url == "https://zip": + return zip_resp + assert url == poll_url + _get.calls += 1 + return poll_running if _get.calls == 1 else poll_done + + _get.calls = 0 + client.get.side_effect = _get + + httpx_mod = types.ModuleType("httpx") + httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "cloud", "poll_interval": 0}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + + assert isinstance(result, ParseResult) + assert "Cloud" in result.markdown + assert result.images["fig.png"] == b"ZBYTES" + # drove the full poll loop: running once, then done + assert _get.calls == 2 + + +def test_poll_interval_zero_is_clamped_to_positive(): + from openkb.parsers.mineru import MineruParser + assert MineruParser({"poll_interval": 0}).poll_interval > 0 + assert MineruParser({"poll_interval": -5}).poll_interval > 0 + assert MineruParser({"poll_interval": 2}).poll_interval == 2 + + +def test_result_from_zip_does_not_rewrite_links(tmp_path): + import io, zipfile + # The images/ -> bare rewrite moved OUT of _result_from_zip into + # localize_images; _result_from_zip must leave the markdown link text intact. 
+ buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)") + zf.writestr("images/fig.png", b"PNG") + from openkb.parsers.mineru import _result_from_zip + result = _result_from_zip(buf.getvalue()) + assert "![p](images/fig.png)" in result.markdown # link text unchanged + assert "other_images/fig.png" in result.markdown # unrelated prose untouched + assert result.images["fig.png"] == b"PNG" # images keyed by basename + + +def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path): + import io, sys, types, zipfile + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None) + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Ok") + zip_bytes = buf.getvalue() + + def _resp(json_data=None, content=None): + r = MagicMock(); r.raise_for_status = MagicMock() + if json_data is not None: r.json.return_value = json_data + if content is not None: r.content = content + return r + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = _resp(json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}}) + client.put.return_value = _resp() + empty = _resp(json_data={"data": {"extract_result": []}}) # queued: empty list + done = _resp(json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}}) + zipr = _resp(content=zip_bytes) + def _get(url, *a, **k): + if url == "https://zip": return zipr + _get.n += 1 + return empty if _get.n == 1 else done + _get.n = 0 + client.get.side_effect = _get + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / "d.pdf"; 
src.write_bytes(b"%PDF") + result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src) + assert "Ok" in result.markdown # survived the empty-list poll without crashing + + +def test_timeout_invalid_is_clamped(): + from openkb.parsers.mineru import MineruParser + assert MineruParser({"timeout": 0}).timeout == 600 + assert MineruParser({"timeout": "x"}).timeout == 600 + assert MineruParser({"timeout": 30}).timeout == 30 + + +def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + r = MagicMock(); r.raise_for_status = MagicMock() + r.json.return_value = {"code": -10001, "msg": "token expired", "data": None} + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = r + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + import pytest + with pytest.raises(RuntimeError) as exc: + MineruParser({"mode": "cloud"}).parse(src) + assert "token expired" in str(exc.value) or "-10001" in str(exc.value) + + +def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + r = MagicMock(); r.raise_for_status = MagicMock() + r.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}} + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = r + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / 
"d.pdf"; src.write_bytes(b"%PDF") + import pytest + with pytest.raises(RuntimeError) as exc: + MineruParser({"mode": "cloud"}).parse(src) + assert "upload URL" in str(exc.value) + + +def test_full_md_basename_preferred_over_endswith(tmp_path): + import io, zipfile + from openkb.parsers.mineru import _result_from_zip + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("notfull.md", "# WRONG") # ends with 'full.md' but isn't it + zf.writestr("full.md", "# RIGHT") + result = _result_from_zip(buf.getvalue()) + assert "RIGHT" in result.markdown + assert "WRONG" not in result.markdown + + +def test_image_basename_collision_warns(tmp_path, caplog): + import io, zipfile, logging as _logging + from openkb.parsers.mineru import _result_from_zip + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# x") + zf.writestr("images/fig.png", b"A") + zf.writestr("sub/fig.png", b"B") + with caplog.at_level(_logging.WARNING): + result = _result_from_zip(buf.getvalue()) + assert any("fig.png" in r.message for r in caplog.records) diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py new file mode 100644 index 00000000..8283102d --- /dev/null +++ b/tests/test_parsers_mistral.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import base64 +import sys +import types +from unittest.mock import MagicMock + +import pytest + +from openkb.parsers.base import ParseResult + + +def _install_fake_mistralai(monkeypatch, client_instance): + mod = types.ModuleType("mistralai") + mod.Mistral = MagicMock(return_value=client_instance) + monkeypatch.setitem(sys.modules, "mistralai", mod) + return mod + + +def test_supports_pdf(): + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + assert p.supports(".pdf") is True + assert p.supports(".docx") is False + + +def test_missing_key_raises_actionable(monkeypatch, tmp_path): + monkeypatch.delenv("MISTRAL_API_KEY", raising=False) + from 
openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "MISTRAL_API_KEY" in str(exc.value) + + +def test_parse_collects_markdown_and_decodes_images(monkeypatch, tmp_path): + monkeypatch.setenv("MISTRAL_API_KEY", "k") + img_bytes = b"IMGDATA" + img_b64 = base64.b64encode(img_bytes).decode() + + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + page = MagicMock() + page.markdown = "Text ![img-0.png](img-0.png)" + page.images = [MagicMock(id="img-0.png", image_base64=f"data:image/png;base64,{img_b64}")] + client.ocr.process.return_value = MagicMock(pages=[page]) + + _install_fake_mistralai(monkeypatch, client) + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + + assert isinstance(result, ParseResult) + assert "img-0.png" in result.markdown + assert result.images["img-0.png"] == img_bytes + + +def test_missing_package_raises_install_hint(monkeypatch, tmp_path): + monkeypatch.setenv("MISTRAL_API_KEY", "k") + monkeypatch.setitem(sys.modules, "mistralai", None) # force ImportError + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "openkb[mistral]" in str(exc.value) + + +def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog): + import logging as _logging + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + page = MagicMock() + page.markdown = "Text ![bad.png](bad.png)" + page.images = [MagicMock(id="bad.png", 
image_base64="!!!not-base64!!!")] + client.ocr.process.return_value = MagicMock(pages=[page]) + _install_fake_mistralai(monkeypatch, client) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with caplog.at_level(_logging.WARNING): + result = MistralParser({}).parse(src) + assert "bad.png" not in result.images + assert any("bad.png" in r.message for r in caplog.records) + + +def test_uploaded_file_is_deleted(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + client.ocr.process.return_value = MagicMock(pages=[]) + mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "mistralai", mod) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + MistralParser({}).parse(src) + client.files.delete.assert_called_once_with(file_id="file-1") + + +def test_uploaded_file_deleted_even_on_ocr_error(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + import pytest + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-2") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + client.ocr.process.side_effect = RuntimeError("ocr boom") + mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "mistralai", mod) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError): + MistralParser({}).parse(src) + client.files.delete.assert_called_once_with(file_id="file-2") diff --git a/tests/test_parsers_registry.py 
b/tests/test_parsers_registry.py new file mode 100644 index 00000000..612ed259 --- /dev/null +++ b/tests/test_parsers_registry.py @@ -0,0 +1,40 @@ +"""Tests for parser selection / registry.""" +from __future__ import annotations + +import pytest + +from openkb.parsers.registry import get_parser +from openkb.parsers.local import LocalParser + + +def _kwargs(): + return {"doc_name": "d", "images_dir": None, "source_dir": None} + + +def test_default_is_local(): + p = get_parser({}, **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_explicit_local(): + p = get_parser({"parser": "local"}, **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_override_wins_over_config(): + p = get_parser({"parser": "mistral"}, override="local", **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_unknown_name_raises_with_valid_options(): + with pytest.raises(ValueError) as exc: + get_parser({"parser": "nope"}, **_kwargs()) + assert "nope" in str(exc.value) + assert "local" in str(exc.value) + + +def test_valid_parsers_matches_dispatch(): + from openkb.parsers.registry import VALID_PARSERS, _ONLINE_PARSERS + # local + every online factory key, no drift + assert set(VALID_PARSERS) == {"local", *_ONLINE_PARSERS} + assert VALID_PARSERS[0] == "local" diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py new file mode 100644 index 00000000..b5a99400 --- /dev/null +++ b/tests/test_parsers_vlm.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from unittest.mock import patch + +from openkb.parsers.vlm import VLMParser +from openkb.parsers.base import ParseResult + + +def test_supports_pdf_only_for_v1(): + p = VLMParser({}, model="gemini/gemini-2.5-pro") + assert p.supports(".pdf") is True + assert p.supports(".md") is False + assert p.supports(".docx") is False + + +def test_parse_calls_transcribe_with_configured_model(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF") + p = VLMParser({"model": "gpt-4o"}, model="fallback-model") 
+ with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# MD") as t: + result = p.parse(src) + t.assert_called_once_with(src, model="gpt-4o") + assert isinstance(result, ParseResult) + assert result.markdown == "# MD" + + +def test_parse_falls_back_to_global_model(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF") + p = VLMParser({}, model="global-model") + with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t: + p.parse(src) + t.assert_called_once_with(src, model="global-model") + + +def test_warns_when_falling_back_to_global_model(caplog): + import logging as _logging + with caplog.at_level(_logging.WARNING): + VLMParser({}, model="gpt-5.4-mini") + assert any("parsers.vlm.model" in r.message for r in caplog.records) + + +def test_no_warning_when_vlm_model_set(caplog): + import logging as _logging + with caplog.at_level(_logging.WARNING): + VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini") + assert not any("parsers.vlm.model" in r.message for r in caplog.records) + + +def test_parse_warns_text_only(tmp_path, caplog): + import logging as _logging + from unittest.mock import patch + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + p = VLMParser({"model": "gemini/gemini-2.5-pro"}) + with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# md"): + with caplog.at_level(_logging.WARNING): + p.parse(src) + assert any("text only" in r.message for r in caplog.records) diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py new file mode 100644 index 00000000..3703c179 --- /dev/null +++ b/tests/test_parsers_vlm_client.py @@ -0,0 +1,61 @@ +"""Tests for the reusable litellm vision client.""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from openkb.parsers.vlm_client import transcribe_to_markdown + + +def _fake_response(text): + resp = MagicMock() + resp.choices = [MagicMock(message=MagicMock(content=text))] + return 
resp + + +def test_transcribe_pdf_sends_data_uri_and_returns_content(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 data") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("# Parsed")) as comp: + out = transcribe_to_markdown(src, model="gemini/gemini-2.5-pro") + assert out == "# Parsed" + _, kwargs = comp.call_args + assert kwargs["model"] == "gemini/gemini-2.5-pro" + content = kwargs["messages"][0]["content"] + assert any("base64" in str(part) for part in content) + + +def test_default_model_used_when_none(tmp_path): + src = tmp_path / "img.png" + src.write_bytes(b"PNG") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("desc")) as comp: + transcribe_to_markdown(src, model=None) + _, kwargs = comp.call_args + assert kwargs["model"] # some non-empty default + + +def test_pdf_uses_file_content_part(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 data") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("x")) as comp: + transcribe_to_markdown(src, model="some/model") + content = comp.call_args.kwargs["messages"][0]["content"] + file_parts = [p for p in content if p.get("type") == "file"] + assert len(file_parts) == 1 + assert file_parts[0]["file"]["file_data"].startswith("data:application/pdf;base64,") + assert not any(p.get("type") == "image_url" for p in content) + + +def test_image_uses_image_url_content_part(tmp_path): + src = tmp_path / "fig.png" + src.write_bytes(b"\x89PNG\r\n") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("x")) as comp: + transcribe_to_markdown(src, model="some/model") + content = comp.call_args.kwargs["messages"][0]["content"] + image_parts = [p for p in content if p.get("type") == "image_url"] + assert len(image_parts) == 1 + assert image_parts[0]["image_url"]["url"].startswith("data:image/png;base64,") + assert not any(p.get("type") 
== "file" for p in content)