diff --git a/README.md b/README.md
index cc19188f..c1ad36c5 100644
--- a/README.md
+++ b/README.md
@@ -266,6 +266,7 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
 model: gpt-5.4                   # LLM model (any LiteLLM-supported provider)
 language: en                     # Wiki output language
 pageindex_threshold: 20          # PDF pages threshold for PageIndex
+parser: local                    # Document parser: local | mineru | mistral | vlm
 ```
 
 Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
@@ -276,6 +277,50 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p
 | Anthropic | `anthropic/claude-sonnet-4-6` |
 | Gemini | `gemini/gemini-3.1-pro-preview` |
 
+### Document parsers
+
+By default OpenKB extracts Markdown locally (pymupdf for PDFs, markitdown for
+Office/HTML) — no extra dependencies, unchanged behavior. For higher accuracy on
+complex documents you can route the file → Markdown step through an online or
+self-hosted parser:
+
+```yaml
+# .openkb/config.yaml
+parser: mineru          # local (default) | mineru | mistral | vlm
+parsers:
+  mineru:
+    mode: cloud         # cloud | self_hosted
+    base_url: http://localhost:8000   # required when mode is self_hosted
+  vlm:
+    model: gemini/gemini-2.5-pro      # any LiteLLM vision model (Gemini, GPT-4o, Claude, …)
+```
+
+Install the optional dependency for your parser:
+
+```bash
+pip install openkb[mistral]   # Mistral OCR
+pip install openkb[mineru]    # MinerU (HTTP)
+pip install openkb[parsers]   # all online parsers
+# vlm uses the existing LiteLLM dependency — no extra needed
+```
+
+Set the API key via environment variable: `MINERU_API_KEY` for MinerU cloud mode
+and `MISTRAL_API_KEY` for Mistral; the `vlm` parser reuses the existing
+`LLM_API_KEY`. Override the parser for a single run with
+`openkb add --parser mistral file.pdf` (`local | mineru | mistral | vlm`).
+
+Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Excel,
+and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always
+fall back to the local parser.
+
+The `vlm` parser is **text-only**: it transcribes a document's text via a vision
+LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or
+`local` if you need image extraction.
+
+> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be
+> indexed with PageIndex and are **not** affected by the `parser` setting. The
+> parser governs the file → Markdown step for shorter documents and non-PDF files.
+
 ### PageIndex Integration
 
 Long documents are challenging for LLMs due to context limits, context rot, and summarization loss.
diff --git a/openkb/cli.py b/openkb/cli.py
index 1a2761d8..b19da6a8 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -43,6 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool:
 from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
 from openkb.converter import convert_document
 from openkb.log import append_log
+from openkb.parsers.registry import VALID_PARSERS
 from openkb.schema import AGENTS_MD
 
 # Suppress warnings after all imports — markitdown overrides filters at import time
@@ -124,17 +125,19 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None:
     else:
         litellm.api_key = api_key
 
-        # Dynamically set the provider-specific env var when possible
         if provider:
+            # Active provider is known — set only its key, so LLM_API_KEY is not
+            # sprayed into unrelated provider keys (e.g. MISTRAL_API_KEY, which the
+            # Mistral parser treats as a real Mistral credential).
             provider_env = f"{provider.upper()}_API_KEY"
             if not os.environ.get(provider_env):
                 os.environ[provider_env] = api_key
-
-        # Fallback: also set common provider keys so multi-provider
-        # configs (e.g. PageIndex Cloud) still work
-        for env_var in _KNOWN_PROVIDER_KEYS:
-            if not os.environ.get(env_var):
-                os.environ[env_var] = api_key
+        else:
+            # Provider couldn't be determined — fall back to setting the common
+            # provider keys so multi-provider configs still work.
+            for env_var in _KNOWN_PROVIDER_KEYS:
+                if not os.environ.get(env_var):
+                    os.environ[env_var] = api_key
 
 # Supported document extensions for the `add` command
 SUPPORTED_EXTENSIONS = {
@@ -259,7 +262,7 @@ def _clear_existing_skill_dir(kb_dir: Path, name: str) -> None:
         shutil.rmtree(target)
 
 
-def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
+def add_single_file(file_path: Path, kb_dir: Path, parser_override: str | None = None) -> Literal["added", "skipped", "failed"]:
     """Convert, index, and compile a single document into the knowledge base.
 
     Steps:
@@ -289,7 +292,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped"
     # 2. Convert document
     click.echo(f"Adding: {file_path.name}")
     try:
-        result = convert_document(file_path, kb_dir)
+        result = convert_document(file_path, kb_dir, parser_override=parser_override)
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
         logger.debug("Conversion traceback:", exc_info=True)
@@ -575,8 +578,11 @@ def init(model, language):
 
 @cli.command()
 @click.argument("path")
+@click.option("--parser", "parser_override", default=None,
+              type=click.Choice(VALID_PARSERS),
+              help="Override the configured parser for this run.")
 @click.pass_context
-def add(ctx, path):
+def add(ctx, path, parser_override):
     """Add a document or directory of documents at PATH to the knowledge base.
 
     PATH may be a local file, a local directory (which is walked
@@ -600,7 +606,7 @@ def add(ctx, path):
         fetched = fetch_url_to_raw(path, kb_dir)
         if fetched is None:
             return
-        outcome = add_single_file(fetched, kb_dir)
+        outcome = add_single_file(fetched, kb_dir, parser_override=parser_override)
         # Only clean up on dedup-skip. On "failed" we keep the file so
         # the user can retry (e.g. transient LLM error during compile)
         # without re-downloading — and so they don't lose data when
@@ -626,7 +632,7 @@ def add(ctx, path):
         click.echo(f"Found {total} supported file(s) in {path}.")
         for i, f in enumerate(files, 1):
             click.echo(f"\n[{i}/{total}] ", nl=False)
-            add_single_file(f, kb_dir)
+            add_single_file(f, kb_dir, parser_override=parser_override)
     else:
         if target.suffix.lower() not in SUPPORTED_EXTENSIONS:
             click.echo(
@@ -634,7 +640,7 @@ def add(ctx, path):
                 f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
             )
             return
-        add_single_file(target, kb_dir)
+        add_single_file(target, kb_dir, parser_override=parser_override)
 
 
 def _stream_to_tty() -> bool:
diff --git a/openkb/config.py b/openkb/config.py
index b83e1346..dea9d482 100644
--- a/openkb/config.py
+++ b/openkb/config.py
@@ -9,6 +9,7 @@
     "model": "gpt-5.4-mini",
     "language": "en",
     "pageindex_threshold": 20,
+    "parser": "local",
 }
 
 GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
diff --git a/openkb/converter.py b/openkb/converter.py
index 352c22b3..2ac6abb1 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -7,10 +7,11 @@
 from pathlib import Path
 
 import pymupdf
-from markitdown import MarkItDown
 
 from openkb.config import load_config
-from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
+from openkb.images import localize_images
+from openkb.parsers import get_parser
+from openkb.parsers.local import LocalParser
 from openkb.state import HashRegistry
 
 logger = logging.getLogger(__name__)
@@ -33,16 +34,17 @@ def get_pdf_page_count(path: Path) -> int:
         return doc.page_count
 
 
-def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
+def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None) -> ConvertResult:
     """Convert a document and integrate it into the knowledge base.
 
     Steps:
     1. Hash-check — skip if already known.
     2. Copy source to ``raw/``.
     3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`.
-    4. If ``.md`` — read, process relative images, save to ``wiki/sources/``.
-    5. Otherwise — run MarkItDown, extract base64 images, save to ``wiki/sources/``.
-    6. Register hash in the registry.
+    4. Select a parser via :func:`get_parser` (falling back to
+       :class:`LocalParser` for unsupported suffixes like ``.md``), parse the
+       file to Markdown, localize images, and save to ``wiki/sources/``.
+    5. Register hash in the registry.
     """
     # ------------------------------------------------------------------
     # Load config & state
@@ -84,7 +86,7 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
             return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
 
     # ------------------------------------------------------------------
-    # 4/5. Convert to Markdown
+    # 4. Select parser, convert to Markdown, localize images
     # ------------------------------------------------------------------
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
@@ -93,18 +95,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
 
     doc_name = src.stem
 
-    if src.suffix.lower() == ".md":
-        markdown = src.read_text(encoding="utf-8")
-        markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
-    elif src.suffix.lower() == ".pdf":
-        # Use pymupdf dict-mode for PDFs: text + images inline at correct positions
-        markdown = convert_pdf_with_images(src, doc_name, images_dir)
+    parser = get_parser(
+        config,
+        override=parser_override,
+        doc_name=doc_name,
+        images_dir=images_dir,
+        source_dir=src.parent,
+    )
+    if not parser.supports(src.suffix):
+        if parser.name != "local":
+            logger.warning(
+                "Parser %r does not support %r; falling back to the local parser for %s.",
+                parser.name, src.suffix, src.name,
+            )
+        parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
+
+    parse_result = parser.parse(src)
+    if parser.name == "local":
+        # LocalParser already persisted images and produced canonical links.
+        markdown = parse_result.markdown
     else:
-        # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
-        mid = MarkItDown()
-        result = mid.convert(str(src))
-        markdown = result.text_content
-        markdown = extract_base64_images(markdown, doc_name, images_dir)
+        markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
 
     dest_md = sources_dir / f"{doc_name}.md"
     dest_md.write_text(markdown, encoding="utf-8")
diff --git a/openkb/images.py b/openkb/images.py
index 76284148..9315a20e 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -17,6 +17,10 @@
 # Matches: ![alt](relative/path) — excludes http(s):// and data: URIs
 _RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)')
 
+# Matches an image link, capturing: (prefix `![alt](` + ws)(target)(optional
+# title + ws)(closing `)`). Used to rewrite links by their target's basename.
+_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))')
+
 
 # Minimum pixel dimension — skip icons, bullets, and tiny artifacts
 _MIN_IMAGE_DIM = 32
@@ -211,6 +215,44 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
     return result
 
 
+def localize_images(
+    markdown: str,
+    images: dict[str, bytes],
+    doc_name: str,
+    images_dir: Path,
+) -> str:
+    """Persist parser-supplied images and normalize image links.
+
+    1. Write every ``images`` entry to ``images_dir`` under its basename
+       (``Path(filename).name``); an empty, ``.`` or ``..`` basename becomes
+       ``image``, so no entry can write outside ``images_dir``.
+    2. Rewrite markdown image links whose target's basename matches a written
+       image to the canonical ``sources/images/{doc_name}/{basename}`` path
+       (bare names, directory-prefixed targets, links carrying a title).
+       Remote ``http(s)://`` and ``data:`` targets are left untouched.
+    3. Localize any inline base64 images via :func:`extract_base64_images`.
+
+    Returns the normalized markdown.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    safe_names: set[str] = set()
+    for filename, data in images.items():
+        safe = Path(filename).name
+        safe = "image" if safe in ("", ".", "..") else safe
+        (images_dir / safe).write_bytes(data)
+        safe_names.add(safe)
+
+    def _rewrite(m: "re.Match[str]") -> str:
+        pre, target, title, close = m.groups()
+        base = Path(target).name
+        if base in safe_names and not target.lower().startswith(("http://", "https://", "data:")):
+            return f"{pre}sources/images/{doc_name}/{base}{title}{close}"
+        return m.group(0)
+
+    result = _IMG_LINK_RE.sub(_rewrite, markdown)
+    return extract_base64_images(result, doc_name, images_dir)
+
+
 def copy_relative_images(
     markdown: str, source_dir: Path, doc_name: str, images_dir: Path
 ) -> str:
diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py
new file mode 100644
index 00000000..aeeeb100
--- /dev/null
+++ b/openkb/parsers/__init__.py
@@ -0,0 +1,5 @@
+"""Pluggable document parsers for the file → Markdown step."""
+from openkb.parsers.base import ParseResult, Parser
+from openkb.parsers.registry import get_parser
+
+__all__ = ["ParseResult", "Parser", "get_parser"]
diff --git a/openkb/parsers/base.py b/openkb/parsers/base.py
new file mode 100644
index 00000000..deb07d60
--- /dev/null
+++ b/openkb/parsers/base.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class ParseResult:
+    """Normalized output of a parser.
+
+    ``markdown`` references images either by a target whose basename is a key
+    of ``images`` or as inline base64 data URIs. ``images`` maps a filename to
+    its raw bytes; the caller persists them and rewrites links via
+    :func:`openkb.images.localize_images`.
+    """
+
+    markdown: str  # the parsed document text
+    images: dict[str, bytes] = field(default_factory=dict)  # filename -> raw bytes; may be empty
+
+
+class Parser(ABC):
+    """Converts a source document to Markdown."""
+
+    name: str  # short identifier set by each subclass (e.g. "local", "mineru")
+
+    @abstractmethod
+    def supports(self, suffix: str) -> bool:
+        """Return True if this parser handles files with ``suffix`` (e.g. ``.pdf``)."""
+
+    @abstractmethod
+    def parse(self, src: Path) -> ParseResult:
+        """Parse ``src`` and return a :class:`ParseResult`."""
diff --git a/openkb/parsers/local.py b/openkb/parsers/local.py
new file mode 100644
index 00000000..d714d0ce
--- /dev/null
+++ b/openkb/parsers/local.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from markitdown import MarkItDown
+
+from openkb.images import (
+    convert_pdf_with_images,
+    copy_relative_images,
+    extract_base64_images,
+)
+from openkb.parsers.base import ParseResult, Parser
+
+_LOCAL_EXTENSIONS = {
+    ".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".xls",
+    ".html", ".htm", ".txt", ".csv",
+}
+
+
+class LocalParser(Parser):
+    """Default parser: pymupdf for PDF, markitdown for office/html, direct read for md."""
+
+    name = "local"
+
+    def __init__(self, doc_name: str = "", images_dir: Path | None = None,
+                 source_dir: Path | None = None):
+        self.doc_name = doc_name
+        self.images_dir = images_dir
+        self.source_dir = source_dir  # base for relative image links; None -> src.parent
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _LOCAL_EXTENSIONS
+
+    def parse(self, src: Path) -> ParseResult:
+        """Convert ``src`` to Markdown; image handling is delegated to the openkb.images helpers."""
+        suffix = src.suffix.lower()
+        if suffix in {".md", ".markdown"}:
+            # Honour the configured source_dir instead of silently ignoring it.
+            base_dir = self.source_dir if self.source_dir is not None else src.parent
+            markdown = copy_relative_images(
+                src.read_text(encoding="utf-8"), base_dir, self.doc_name, self.images_dir)
+        elif suffix == ".pdf":
+            markdown = convert_pdf_with_images(src, self.doc_name, self.images_dir)
+        else:
+            markdown = MarkItDown().convert(str(src)).text_content
+            markdown = extract_base64_images(markdown, self.doc_name, self.images_dir)
+        return ParseResult(markdown=markdown)
diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py
new file mode 100644
index 00000000..e9356e19
--- /dev/null
+++ b/openkb/parsers/mineru.py
@@ -0,0 +1,147 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+import time
+import zipfile
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"}
+_CLOUD_BASE = "https://mineru.net/api/v4"
+
+
+def _httpx():  # imported on first use so a missing httpx yields an install hint, not a bare ImportError
+    try:
+        import httpx
+    except ImportError as exc:
+        raise RuntimeError(
+            "MinerU parser requires 'httpx'. Install with: pip install openkb[mineru]"
+        ) from exc
+    return httpx
+
+
+def _result_from_zip(zip_bytes: bytes) -> ParseResult:
+    """Extract Markdown (``full.md`` preferred) + images from a MinerU zip; RuntimeError if no .md."""
+    images: dict[str, bytes] = {}
+    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+        names = zf.namelist()
+        md_names = sorted(n for n in names if n.lower().endswith(".md"))
+        if not md_names:
+            # An archive without Markdown means extraction failed; never store "" as the document.
+            raise RuntimeError("MinerU result archive contains no Markdown file.")
+        chosen = next((n for n in md_names if Path(n).name == "full.md"), md_names[0])
+        markdown = zf.read(chosen).decode("utf-8", errors="replace")
+        for name in names:
+            if not name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")):
+                continue
+            base = Path(name).name
+            if base in images:  # same basename in two folders: the later entry wins
+                logger.warning("MinerU result has multiple images named %r in different "
+                               "folders; keeping the last. Earlier one may be lost.", base)
+            images[base] = zf.read(name)
+    return ParseResult(markdown=markdown, images=images)
+
+
+def _mineru_body(resp):
+    """Return the 'data' dict from a MinerU v4 JSON response; raise RuntimeError on API errors."""
+    if not isinstance(body := resp.json(), dict):
+        raise RuntimeError(f"MinerU API returned a non-object JSON body: {body!r}")
+    if (code := body.get("code")) not in (0, None):
+        raise RuntimeError(f"MinerU API error (code={code}): {body.get('msg')}")
+    return body.get("data") or {}
+
+
+class MineruParser(Parser):
+    """MinerU via HTTP — self-hosted server or hosted cloud API."""
+
+    name = "mineru"
+
+    def __init__(self, opts: dict[str, Any] | None = None):
+        """Read ``mode``, ``base_url``, ``poll_interval`` and ``timeout`` from ``opts``."""
+        self.opts = opts or {}
+        self.mode = self.opts.get("mode") or "cloud"
+        self.base_url = self.opts.get("base_url")
+        # Non-numeric or non-positive values fall back to the defaults (3 s / 600 s).
+        pi = self.opts.get("poll_interval", 3)
+        self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3
+        t = self.opts.get("timeout", 600)
+        self.timeout = t if isinstance(t, (int, float)) and t > 0 else 600
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        """Dispatch on ``mode``; an unknown mode is an error, never a silent cloud upload."""
+        if self.mode == "self_hosted":
+            return self._parse_self_hosted(src)
+        if self.mode == "cloud":
+            return self._parse_cloud(src)
+        raise RuntimeError(
+            f"Unknown MinerU mode {self.mode!r} in parsers.mineru config; "
+            "expected 'cloud' or 'self_hosted'."
+        )
+
+    def _parse_self_hosted(self, src: Path) -> ParseResult:
+        """POST ``src`` to ``{base_url}/file_parse`` and unpack the returned zip."""
+        if not self.base_url:
+            raise RuntimeError(
+                "MinerU self_hosted mode requires 'base_url' in parsers.mineru config."
+            )
+        httpx = _httpx()
+        url = self.base_url.rstrip("/") + "/file_parse"
+        with httpx.Client(timeout=self.timeout) as client:
+            resp = client.post(
+                url, files={"file": (src.name, src.read_bytes())}, data={"return_format": "zip"}
+            )
+            resp.raise_for_status()
+            return _result_from_zip(resp.content)
+
+    def _parse_cloud(self, src: Path) -> ParseResult:
+        """Upload ``src`` to the MinerU cloud API, poll until done, unpack the result zip."""
+        api_key = os.environ.get("MINERU_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "MinerU cloud mode requires the MINERU_API_KEY environment variable."
+            )
+        httpx = _httpx()
+        headers = {"Authorization": f"Bearer {api_key}"}
+        with httpx.Client(timeout=min(self.timeout, 120)) as client:
+            r = client.post(f"{_CLOUD_BASE}/file-urls/batch", headers=headers,
+                            json={"files": [{"name": src.name, "is_ocr": True}]})
+            r.raise_for_status()
+            data = _mineru_body(r)
+            batch_id = data.get("batch_id")
+            file_urls = data.get("file_urls") or []
+            if not batch_id or not file_urls:
+                raise RuntimeError(f"MinerU returned no upload URL: {data}")
+            client.put(file_urls[0], content=src.read_bytes()).raise_for_status()
+            status_url = f"{_CLOUD_BASE}/extract-results/batch/{batch_id}"
+            # self.timeout bounds the whole polling phase; each request is capped at 120 s above.
+            deadline = time.monotonic() + self.timeout
+            zip_url = None
+            while time.monotonic() < deadline:
+                pr = client.get(status_url, headers=headers)
+                pr.raise_for_status()
+                results = _mineru_body(pr).get("extract_result") or []
+                state = results[0].get("state") if results else None
+                if state == "done":
+                    zip_url = results[0].get("full_zip_url")
+                    if not zip_url:
+                        raise RuntimeError(
+                            f"MinerU reported done but no full_zip_url: {results[0]}"
+                        )
+                    break
+                if state == "failed":
+                    raise RuntimeError(f"MinerU extraction failed: {results[0]}")
+                time.sleep(self.poll_interval)  # no result yet, or not finished
+            if zip_url is None:
+                raise RuntimeError("MinerU extraction timed out.")
+            zr = client.get(zip_url)
+            zr.raise_for_status()
+            return _result_from_zip(zr.content)
diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py
new file mode 100644
index 00000000..9f5d0706
--- /dev/null
+++ b/openkb/parsers/mistral.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import base64
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED = {".pdf"}
+_DATA_URI_RE = re.compile(r"^data:[^;]+;base64,", re.IGNORECASE)
+
+
+class MistralParser(Parser):
+    """Mistral OCR (Document AI). Synchronous; markdown + base64 images."""
+
+    name = "mistral"
+
+    def __init__(self, opts: dict[str, Any] | None = None):
+        self.opts = opts or {}
+        self.model = self.opts.get("model", "mistral-ocr-latest")
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        """Upload ``src``, OCR it via a signed URL, and collect per-page markdown + images.
+
+        Raises RuntimeError when ``MISTRAL_API_KEY`` is unset or ``mistralai`` is
+        not installed. The uploaded file is deleted afterwards on a best-effort basis.
+        """
+        key = os.environ.get("MISTRAL_API_KEY")
+        if not key:
+            raise RuntimeError(
+                "Mistral parser requires the MISTRAL_API_KEY environment variable."
+            )
+        try:
+            from mistralai import Mistral
+        except ImportError as exc:
+            raise RuntimeError(
+                "Mistral parser requires the 'mistralai' package. "
+                "Install with: pip install openkb[mistral]"
+            ) from exc
+
+        client = Mistral(api_key=key)
+        uploaded = None
+        try:
+            payload = {"file_name": src.name, "content": src.read_bytes()}
+            uploaded = client.files.upload(file=payload, purpose="ocr")
+            document = {"type": "document_url",
+                        "document_url": client.files.get_signed_url(file_id=uploaded.id).url}
+            ocr = client.ocr.process(model=self.model, document=document, include_image_base64=True)
+            pages_md: list[str] = []
+            decoded: dict[str, bytes] = {}
+            for page in ocr.pages:
+                pages_md.append(page.markdown or "")
+                for img in getattr(page, "images", None) or []:
+                    # Strip an optional "data:<mime>;base64," prefix before decoding.
+                    b64 = _DATA_URI_RE.sub("", img.image_base64 or "")
+                    try:
+                        decoded[img.id] = base64.b64decode(b64, validate=True)
+                    except Exception:
+                        logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
+            return ParseResult(markdown="\n\n".join(pages_md), images=decoded)
+        finally:
+            if uploaded is not None:
+                try:
+                    client.files.delete(file_id=uploaded.id)
+                except Exception:
+                    # Cleanup failure must not mask the parse result or its error.
+                    logger.warning(
+                        "Failed to delete uploaded Mistral OCR file %s",
+                        getattr(uploaded, "id", "?"),
+                    )
diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py
new file mode 100644
index 00000000..9a5a55b0
--- /dev/null
+++ b/openkb/parsers/registry.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import Parser
+from openkb.parsers.local import LocalParser
+
+
+def _make_mistral(opts, config):  # ``config`` is unused; kept so all factories share one signature
+    from openkb.parsers.mistral import MistralParser  # imported only when this parser is selected
+    return MistralParser(opts)
+
+
+def _make_vlm(opts, config):  # same (opts, config) signature as the other factories
+    from openkb.parsers.vlm import VLMParser  # imported only when this parser is selected
+    return VLMParser(opts, model=config.get("model"))  # passes the global ``model`` setting along
+
+
+def _make_mineru(opts, config):  # ``config`` is unused; kept so all factories share one signature
+    from openkb.parsers.mineru import MineruParser  # imported only when this parser is selected
+    return MineruParser(opts)
+
+
+# Single source of truth: online-parser name -> lazy factory.
+_ONLINE_PARSERS = {
+    "mineru": _make_mineru,
+    "mistral": _make_mistral,
+    "vlm": _make_vlm,
+}
+
+# Valid parser names (drives the CLI --parser choice and error messages).
+VALID_PARSERS = ("local", *_ONLINE_PARSERS)
+
+
+def get_parser(
+    config: dict[str, Any],
+    override: str | None = None,
+    *,
+    doc_name: str = "",
+    images_dir: Path | None = None,
+    source_dir: Path | None = None,
+) -> Parser:
+    """Build the parser named by ``override`` (e.g. CLI ``--parser``), else by config.
+
+    Falls back to ``local``; raises ValueError for a name that is not registered.
+    """
+    chosen = (override or config.get("parser") or "local").lower()
+    if chosen == "local":
+        return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir)
+    if chosen not in _ONLINE_PARSERS:
+        raise ValueError(f"Unknown parser {chosen!r}. Valid options: {', '.join(VALID_PARSERS)}.")
+    per_parser_opts = (config.get("parsers", {}) or {}).get(chosen, {}) or {}
+    return _ONLINE_PARSERS[chosen](per_parser_opts, config)
diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py
new file mode 100644
index 00000000..6467483f
--- /dev/null
+++ b/openkb/parsers/vlm.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+from openkb.parsers.vlm_client import transcribe_to_markdown
+
+logger = logging.getLogger(__name__)
+
+_SUPPORTED = {".pdf"}
+
+
+class VLMParser(Parser):
+    """Parse via a vision-capable LLM (litellm). Covers Gemini, GPT-4o, Claude, etc."""
+
+    name = "vlm"
+
+    def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None):
+        configured = (opts or {}).get("model")
+        self.model = configured or model  # parsers.vlm.model beats the global model
+        if not configured:
+            logger.warning(
+                "VLM parser: 'parsers.vlm.model' is not set; using the global model "
+                "%r for vision parsing. If that model is not vision-capable, set "
+                "'parsers.vlm.model' to one (e.g. gemini/gemini-2.5-pro).",
+                self.model,
+            )
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        """Transcribe ``src`` to Markdown text; the result carries no images."""
+        text = transcribe_to_markdown(src, model=self.model)
+        logger.warning(
+            "VLM parser transcribes %s to text only; embedded figures/images are "
+            "not extracted. Use a parser like 'mineru' if you need figure extraction.",
+            src.name,
+        )
+        return ParseResult(markdown=text)
diff --git a/openkb/parsers/vlm_client.py b/openkb/parsers/vlm_client.py
new file mode 100644
index 00000000..1f2774f8
--- /dev/null
+++ b/openkb/parsers/vlm_client.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import base64
+import mimetypes
+from pathlib import Path
+
+import litellm
+
+_DEFAULT_MODEL = "gemini/gemini-2.5-pro"
+
+_PROMPT = (
+    "Transcribe this document to clean GitHub-flavored Markdown. Preserve headings, "
+    "lists, tables (as Markdown or HTML tables), and math (as LaTeX). Output only the "
+    "Markdown content, no commentary."
+)
+
+
+def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | None = None) -> str:
+    """Send ``src`` (PDF or image) to a vision-capable LLM via litellm; return Markdown.
+
+    ``model`` defaults to ``_DEFAULT_MODEL`` and ``prompt`` to ``_PROMPT``; an empty
+    completion yields ``""``.
+    """
+    mime = mimetypes.guess_type(src.name)[0] or "application/octet-stream"
+    data_uri = f"data:{mime};base64,{base64.b64encode(src.read_bytes()).decode()}"
+    # PDFs go in litellm's document/file content part; image_url is only for raster images.
+    media_part = (
+        {"type": "file", "file": {"file_data": data_uri}}
+        if mime == "application/pdf"
+        else {"type": "image_url", "image_url": {"url": data_uri}}
+    )
+    message = {"role": "user", "content": [{"type": "text", "text": prompt or _PROMPT}, media_part]}
+    resp = litellm.completion(model=model or _DEFAULT_MODEL, messages=[message])
+    return resp.choices[0].message.content or ""
diff --git a/pyproject.toml b/pyproject.toml
index 026dea23..5d1e241c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,9 @@ testpaths = ["tests"]
 
 [project.optional-dependencies]
 dev = ["pytest", "pytest-asyncio"]
+mistral = ["mistralai"]
+mineru = ["httpx"]
+parsers = ["mistralai", "httpx"]
 
 [tool.hatch.version]
 source = "vcs"
diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index 1fb4d87f..4bdf7be1 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -70,7 +70,7 @@ def test_add_single_file_calls_helper(self, tmp_path):
         with patch("openkb.cli.add_single_file") as mock_add, \
              patch("openkb.cli._find_kb_dir", return_value=kb_dir):
             runner.invoke(cli, ["add", str(doc)])
-            mock_add.assert_called_once_with(doc, kb_dir)
+            mock_add.assert_called_once_with(doc, kb_dir, parser_override=None)
 
     def test_add_directory_calls_helper_for_each_file(self, tmp_path):
         kb_dir = self._setup_kb(tmp_path)
@@ -147,3 +147,29 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
             result = runner.invoke(cli, ["add", str(doc)])
             mock_arun.assert_called_once()
             assert "OK" in result.output
+
+
+def test_add_single_file_threads_parser_override(tmp_path):
+    """``add_single_file`` must hand its ``parser_override`` on to ``convert_document``."""
+    from pathlib import Path
+    from unittest.mock import patch
+    from openkb.cli import add_single_file
+    stub_fields = {"skipped": True, "is_long_doc": False, "file_hash": None,
+                   "raw_path": None, "source_path": None}
+    skipped_result = type("R", (), stub_fields)()
+    with patch("openkb.cli.convert_document", return_value=skipped_result) as convert_mock, \
+         patch("openkb.cli._setup_llm_key"), \
+         patch("openkb.cli.load_config", return_value={"model": "m"}):
+        add_single_file(Path("x.pdf"), tmp_path, parser_override="mistral")
+    # Accept the override either as a keyword or as the third positional argument.
+    call = convert_mock.call_args
+    assert call.kwargs.get("parser_override") == "mistral" or call.args[2:3] == ("mistral",)
+
+
+def test_add_parser_option_rejects_invalid_choice(tmp_path):
+    """An unknown ``--parser`` value makes ``openkb add`` exit non-zero and report the problem."""
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    argv = ["add", "--parser", "bogus", str(tmp_path / "x.pdf")]
+    outcome = CliRunner().invoke(cli, argv)
+    assert outcome.exit_code != 0 and any(s in outcome.output for s in ("bogus", "Invalid value"))
diff --git a/tests/test_cli.py b/tests/test_cli.py
index ab3378b1..e80e272f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -365,3 +365,17 @@ async def fake_run_query(*_args, **_kwargs):
         assert "rnn" in saved
         assert "[[concepts/multi-head-attention]]" not in saved
         assert "multi head attention" in saved
+
+
+def test_setup_llm_key_does_not_spray_unrelated_provider_keys(tmp_path, monkeypatch):
+    import os
+    from openkb.cli import _setup_llm_key
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: openai/gpt-4o\n")  # known provider
+    for k in ("MISTRAL_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
+        # delenv alone records nothing for an absent key; setenv first so teardown also
+        monkeypatch.setenv(k, ""); monkeypatch.delenv(k)  # removes what the helper exports
+    monkeypatch.setenv("LLM_API_KEY", "sk-test")
+    _setup_llm_key(tmp_path)
+    assert os.environ.get("OPENAI_API_KEY") == "sk-test"   # active provider set
+    assert os.environ.get("MISTRAL_API_KEY") is None        # unrelated provider NOT sprayed
diff --git a/tests/test_config.py b/tests/test_config.py
index 35704a6b..0d9aae36 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -45,3 +45,8 @@ def test_load_overrides_defaults(tmp_path):
     assert loaded["pageindex_threshold"] == 100
     # Non-overridden defaults still present
     assert loaded["language"] == "en"
+
+
+def test_default_parser_is_local():
+    from openkb.config import DEFAULT_CONFIG
+    assert DEFAULT_CONFIG["parser"] == "local"  # the shipped default must not pick an online parser
diff --git a/tests/test_converter.py b/tests/test_converter.py
index d7475b09..6b5f2e41 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -85,7 +85,7 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
 
         with (
             patch("openkb.converter.pymupdf.open") as mock_mu,
-            patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
+            patch("openkb.parsers.local.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
         ):
             fake_doc = MagicMock()
             fake_doc.page_count = 5  # below default threshold of 20
@@ -128,3 +128,67 @@ def test_long_pdf_returns_is_long_doc(self, kb_dir, tmp_path):
         assert result.source_path is None
         assert result.skipped is False
         assert result.raw_path is not None
+
+
+from openkb.parsers.base import ParseResult
+
+
+class TestConvertDocumentParserSelection:
+    def test_uses_get_parser_and_localizes(self, kb_dir):
+        """The parser comes from ``get_parser`` and its output is passed through ``localize_images``."""
+        pdf_path = kb_dir / "raw" / "paper.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake")
+
+        parsed = ParseResult(markdown="HELLO", images={"a.png": b"X"})
+        stub_parser = MagicMock(**{"supports.return_value": True, "parse.return_value": parsed})
+
+        with patch("openkb.converter.get_pdf_page_count", return_value=1), \
+             patch("openkb.converter.get_parser", return_value=stub_parser) as get_parser_mock, \
+             patch("openkb.converter.localize_images", return_value="HELLO-LOCALIZED") as localize_mock:
+            converted = convert_document(pdf_path, kb_dir)
+
+        get_parser_mock.assert_called_once()
+        selection = get_parser_mock.call_args.kwargs
+        assert selection["doc_name"] == "paper" and selection["images_dir"] is not None
+        localize_mock.assert_called_once()
+        assert converted.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED"
+
+    def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir):
+        """A selected parser that rejects the suffix is replaced by a ``LocalParser`` instance."""
+        note_path = kb_dir / "raw" / "notes.md"
+        note_path.write_text("# md", encoding="utf-8")
+
+        rejecting_parser = MagicMock(**{"supports.return_value": False})  # cannot handle .md
+        with patch("openkb.converter.get_parser", return_value=rejecting_parser), \
+             patch("openkb.converter.LocalParser") as local_cls:
+            local_cls.return_value.parse.return_value = ParseResult(markdown="# md")
+            convert_document(note_path, kb_dir)
+        local_cls.assert_called_once()
+
+    def test_local_parser_skips_redundant_localize(self, kb_dir):
+        """Output of a parser named ``"local"`` is written as-is, without ``localize_images``."""
+        note_path = kb_dir / "raw" / "notes.md"
+        note_path.write_text("# md", encoding="utf-8")
+        final_md = ParseResult(markdown="# md final")
+        local_stub = MagicMock(**{"supports.return_value": True, "parse.return_value": final_md})
+        local_stub.name = "local"  # assigned afterwards: the ``name`` constructor kwarg is special
+        with patch("openkb.converter.get_parser", return_value=local_stub), \
+             patch("openkb.converter.localize_images") as localize_mock:
+            converted = convert_document(note_path, kb_dir)
+        localize_mock.assert_not_called()
+        assert converted.source_path.read_text(encoding="utf-8") == "# md final"
+
+    def test_warns_on_silent_downgrade(self, kb_dir, caplog):
+        """Falling back from an unsupported online parser to the local one logs a warning."""
+        import logging as _logging
+        note_path = kb_dir / "raw" / "notes.md"
+        note_path.write_text("# md", encoding="utf-8")
+        online_stub = MagicMock(**{"supports.return_value": False})
+        online_stub.name = "mistral"
+        with patch("openkb.converter.get_parser", return_value=online_stub), \
+             patch("openkb.converter.LocalParser") as local_cls, \
+             caplog.at_level(_logging.WARNING):
+            local_cls.return_value.name = "local"
+            local_cls.return_value.parse.return_value = ParseResult(markdown="# md")
+            convert_document(note_path, kb_dir)
+        assert any("falling back to the local parser" in rec.message for rec in caplog.records)
diff --git a/tests/test_images.py b/tests/test_images.py
index 9abb3ec2..26b8ed1b 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -4,7 +4,7 @@
 import base64
 
 
-from openkb.images import copy_relative_images, extract_base64_images
+from openkb.images import copy_relative_images, extract_base64_images, localize_images
 
 
 # ---------------------------------------------------------------------------
@@ -164,3 +164,83 @@ def test_multiple_relative_images_all_copied(self, tmp_path):
         assert "![b](sources/images/doc/b.jpg)" in result
         assert (images_dir / "a.png").exists()
         assert (images_dir / "b.jpg").exists()
+
+
+# ---------------------------------------------------------------------------
+# localize_images
+# ---------------------------------------------------------------------------
+
+
+def test_localize_images_writes_bytes_and_rewrites_bare_refs(tmp_path):
+    """A bare image reference is rewritten to the wiki path and its bytes are saved."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    source_md, supplied = "Before\n\n![fig](p1_img1.png)\n\nAfter", {"p1_img1.png": b"PNGDATA"}
+    assert "![fig](sources/images/doc/p1_img1.png)" in localize_images(source_md, supplied, "doc", dest)
+    assert dest.joinpath("p1_img1.png").read_bytes() == b"PNGDATA"
+
+
+def test_localize_images_handles_inline_base64(tmp_path):
+    """An inline ``data:`` URI is decoded to ``img_001.jpeg`` and the link is rewritten to it."""
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    payload = base64.b64encode(b"JPEGDATA").decode()
+    md = f"![x](data:image/jpeg;base64,{payload})"
+    out = localize_images(md, {}, "doc", images_dir)
+    assert "sources/images/doc/img_001.jpeg" in out
+    assert (images_dir / "img_001.jpeg").read_bytes() == b"JPEGDATA"
+
+
+def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path):
+    """Images never referenced by the markdown are still written; the text is returned unchanged."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    assert localize_images("no images here", {"orphan.png": b"X"}, "doc", dest) == "no images here"
+    assert dest.joinpath("orphan.png").read_bytes() == b"X"
+
+
+def test_localize_images_filename_with_regex_metachars(tmp_path):
+    """A filename that looks like a ``re.sub`` group reference is treated as literal text."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    tricky_name = r"img\g<9>.png"  # backslash-escape-like name must not crash the rewrite
+    rewritten = localize_images(f"![f]({tricky_name})", {tricky_name: b"DATA"}, "doc", dest)
+    assert f"sources/images/doc/{tricky_name}" in rewritten
+    assert dest.joinpath(tricky_name).read_bytes() == b"DATA"
+
+
+def test_localize_images_strips_path_traversal_in_filename(tmp_path):
+    """``../`` components in an image name are dropped so nothing lands outside the target dir."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    hostile_name = "../../evil.png"
+    rewritten = localize_images(f"![bad]({hostile_name})", {hostile_name: b"DATA"}, "doc", dest)
+    # Only the basename is used, and it is written inside the target directory.
+    assert dest.joinpath("evil.png").read_bytes() == b"DATA"
+    escape_targets = [tmp_path / "evil.png", dest.parent.parent / "evil.png"]
+    assert not any(candidate.exists() for candidate in escape_targets)
+    assert "sources/images/doc/evil.png" in rewritten  # link now points at the sanitized path
+
+
+def test_localize_images_absolute_filename_stays_inside(tmp_path):
+    """An absolute image path is reduced to its basename under the target directory."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    rewritten = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", dest)
+    assert dest.joinpath("x.png").read_bytes() == b"D" and "sources/images/doc/x.png" in rewritten
+
+
+def test_localize_images_rewrites_directory_prefixed_target(tmp_path):
+    """Links carrying a directory prefix are matched to the supplied images by basename."""
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    expected = (("p", "fig.png", b"A"), ("q", "other.png", b"B"))
+    source_md = "![p](images/fig.png)\n\n![q](./sub/images/other.png)"
+    rewritten = localize_images(source_md, {name: data for _, name, data in expected}, "doc", dest)
+    assert all(f"![{alt}](sources/images/doc/{name})" in rewritten for alt, name, _ in expected)
+    assert all(dest.joinpath(name).read_bytes() == data for _, name, data in expected)
+
+
+def test_localize_images_preserves_title_attribute(tmp_path):
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    titled = localize_images('![a](fig.png "Figure 1")', {"fig.png": b"X"}, "doc", dest)
+    assert '![a](sources/images/doc/fig.png "Figure 1")' in titled  # quoted title kept verbatim
+
+
+def test_localize_images_inner_whitespace(tmp_path):
+    dest = tmp_path.joinpath("wiki", "sources", "images", "doc")
+    padded = localize_images("![a]( fig.png )", {"fig.png": b"X"}, "doc", dest)
+    assert "sources/images/doc/fig.png" in padded  # spaces inside the parentheses are tolerated
diff --git a/tests/test_parsers_base.py b/tests/test_parsers_base.py
new file mode 100644
index 00000000..1c119a32
--- /dev/null
+++ b/tests/test_parsers_base.py
@@ -0,0 +1,24 @@
+"""Tests for the parser abstraction base types."""
+from __future__ import annotations
+
+import pytest
+
+from openkb.parsers.base import ParseResult, Parser
+
+
+def test_parse_result_defaults_to_empty_images():
+    """``images`` defaults to an empty dict when only ``markdown`` is given."""
+    parsed = ParseResult(markdown="# Hi")
+    assert (parsed.markdown, parsed.images) == ("# Hi", {})
+
+
+def test_parser_is_abstract():
+    with pytest.raises(TypeError):
+        Parser()  # instantiating the abstract base itself must raise TypeError
+
+
+def test_concrete_parser_must_implement_parse_and_supports():
+    """A subclass that only sets ``name`` must still refuse to instantiate."""
+    incomplete_cls = type("Incomplete", (Parser,), {"name": "incomplete"})
+    with pytest.raises(TypeError):
+        incomplete_cls()
diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py
new file mode 100644
index 00000000..af17ed38
--- /dev/null
+++ b/tests/test_parsers_local.py
@@ -0,0 +1,48 @@
+"""Tests for LocalParser — preserves legacy md/pdf/markitdown behavior."""
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+from openkb.parsers.local import LocalParser
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_all_known_extensions():
+    known = (".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".csv")
+    supports = LocalParser().supports
+    assert all(supports(suffix) is True for suffix in known)  # every locally handled suffix
+
+
+def test_parse_md_reads_text(tmp_path):
+    """Parsing a ``.md`` file yields a ``ParseResult`` whose markdown starts with the file's heading."""
+    note = tmp_path / "n.md"
+    note.write_text("# Title\n\nbody", encoding="utf-8")
+    local = LocalParser(doc_name="n", images_dir=tmp_path / "img" / "n", source_dir=tmp_path)
+    parsed = local.parse(note)
+    assert isinstance(parsed, ParseResult)
+    assert parsed.markdown.startswith("# Title")
+
+
+def test_parse_pdf_delegates_to_convert_pdf_with_images(tmp_path):
+    """PDF input is forwarded to ``convert_pdf_with_images``; its return value is the markdown."""
+    pdf_path, image_root = tmp_path / "doc.pdf", tmp_path / "img" / "doc"
+    pdf_path.write_bytes(b"%PDF-1.4 fake")
+    settings = {"doc_name": "doc", "images_dir": image_root, "source_dir": tmp_path}
+    with patch("openkb.parsers.local.convert_pdf_with_images", return_value="PDF MD") as converter:
+        parsed = LocalParser(**settings).parse(pdf_path)
+    converter.assert_called_once_with(pdf_path, "doc", image_root)
+    assert parsed.markdown == "PDF MD"
+
+
+def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path):
+    """A ``.pptx`` goes through MarkItDown, then ``extract_base64_images`` post-processes the text."""
+    deck_path, image_root = tmp_path / "deck.pptx", tmp_path / "img" / "deck"
+    deck_path.write_bytes(b"PK fake")
+    settings = {"doc_name": "deck", "images_dir": image_root, "source_dir": tmp_path}
+    with patch("openkb.parsers.local.MarkItDown") as markitdown_cls, \
+         patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as extractor:
+        markitdown_cls.return_value.convert.return_value.text_content = "MARKITDOWN MD"
+        parsed = LocalParser(**settings).parse(deck_path)
+    extractor.assert_called_once_with("MARKITDOWN MD", "deck", image_root)
+    assert parsed.markdown == "CLEANED"
diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py
new file mode 100644
index 00000000..f81c8033
--- /dev/null
+++ b/tests/test_parsers_mineru.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import io
+import sys
+import types
+import zipfile
+from unittest.mock import MagicMock
+
+import pytest
+
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_office_and_pdf():
+    """MinerU accepts ``.pdf`` and ``.docx`` input but not ``.md``."""
+    from openkb.parsers.mineru import MineruParser
+    supports = MineruParser({}).supports
+    assert all(supports(suffix) is True for suffix in (".pdf", ".docx"))
+    assert supports(".md") is False
+
+
+def test_self_hosted_requires_base_url(tmp_path):
+    from openkb.parsers.mineru import MineruParser
+    parser = MineruParser({"mode": "self_hosted"})
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError, match="base_url"):  # the error must name the missing setting
+        parser.parse(pdf_path)
+
+
+def test_cloud_requires_api_key(monkeypatch, tmp_path):
+    monkeypatch.delenv("MINERU_API_KEY", raising=False)
+    from openkb.parsers.mineru import MineruParser
+    parser = MineruParser({"mode": "cloud"})
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError, match="MINERU_API_KEY"):  # error must name the variable
+        parser.parse(pdf_path)
+
+
+def test_self_hosted_parses_zip(monkeypatch, tmp_path):
+    buf = io.BytesIO()  # in-memory archive used as the fake server reply
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Mineru\n\n![p](images/fig.png)")
+        zf.writestr("images/fig.png", b"PNGBYTES")
+    zip_bytes = buf.getvalue()
+
+    fake_resp = MagicMock(status_code=200, content=zip_bytes)
+    fake_resp.raise_for_status = MagicMock()
+    fake_client = MagicMock()
+    fake_client.__enter__ = MagicMock(return_value=fake_client)  # ``with Client() as c`` yields the stub
+    fake_client.__exit__ = MagicMock(return_value=False)
+    fake_client.post.return_value = fake_resp
+
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=fake_client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)  # stub module; entry restored at teardown
+
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "self_hosted", "base_url": "http://localhost:8000"})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = p.parse(src)
+    assert isinstance(result, ParseResult)
+    assert "Mineru" in result.markdown
+    assert result.images["fig.png"] == b"PNGBYTES"  # image keyed by basename, not archive path
+    # _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives
+    assert "images/fig.png" in result.markdown
+    # localize_images (which now rewrites by basename) canonicalizes it
+    from openkb.images import localize_images
+    md2 = localize_images(result.markdown, result.images, "d", tmp_path / "imgs")
+    assert "sources/images/d/fig.png" in md2
+
+
+def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)  # no real waiting
+
+    buf = io.BytesIO()  # in-memory archive served as the final download
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Cloud")
+        zf.writestr("images/fig.png", b"ZBYTES")
+    zip_bytes = buf.getvalue()
+
+    def _resp(json_data=None, content=None):  # build a stub HTTP response
+        r = MagicMock()
+        r.raise_for_status = MagicMock()
+        if json_data is not None:
+            r.json.return_value = json_data
+        if content is not None:
+            r.content = content
+        return r
+
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client)  # ``with Client() as c`` yields the stub
+    client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = _resp(
+        json_data={"data": {"batch_id": "b1", "file_urls": ["https://upload"]}}
+    )
+    client.put.return_value = _resp()
+
+    poll_url = "https://mineru.net/api/v4/extract-results/batch/b1"
+    poll_running = _resp(json_data={"data": {"extract_result": [{"state": "running"}]}})
+    poll_done = _resp(
+        json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}}
+    )
+    zip_resp = _resp(content=zip_bytes)
+
+    def _get(url, *a, **k):  # route GETs: archive download vs. status poll
+        if url == "https://zip":
+            return zip_resp
+        assert url == poll_url  # any other GET must be the batch status endpoint
+        _get.calls += 1
+        return poll_running if _get.calls == 1 else poll_done
+
+    _get.calls = 0  # function attribute counts status polls only
+    client.get.side_effect = _get
+
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)  # stub module; entry restored at teardown
+
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "cloud", "poll_interval": 0})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = p.parse(src)
+
+    assert isinstance(result, ParseResult)
+    assert "Cloud" in result.markdown
+    assert result.images["fig.png"] == b"ZBYTES"  # image keyed by basename
+    # drove the full poll loop: running once, then done
+    assert _get.calls == 2
+
+
+def test_poll_interval_zero_is_clamped_to_positive():
+    """Zero or negative ``poll_interval`` values become positive; a valid value is kept."""
+    from openkb.parsers.mineru import MineruParser
+    assert all(MineruParser({"poll_interval": bad}).poll_interval > 0 for bad in (0, -5))
+    assert MineruParser({"poll_interval": 2}).poll_interval == 2
+
+
+def test_result_from_zip_does_not_rewrite_links(tmp_path):
+    """``_result_from_zip`` returns the archive markdown verbatim and keys images by basename."""
+    # The images/ -> bare rewrite moved OUT of _result_from_zip into
+    # localize_images; _result_from_zip must leave the markdown link text intact.
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)")
+        zf.writestr("images/fig.png", b"PNG")
+    from openkb.parsers.mineru import _result_from_zip
+    result = _result_from_zip(buf.getvalue())
+    assert "![p](images/fig.png)" in result.markdown   # link text unchanged
+    assert "other_images/fig.png" in result.markdown    # unrelated prose untouched
+    assert result.images["fig.png"] == b"PNG"           # images keyed by basename
+
+
+def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
+    """An empty ``extract_result`` list (job still queued) is polled past rather than crashing;
+    the archive is fetched once a later poll reports ``done``."""
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Ok")
+    zip_bytes = buf.getvalue()
+
+    def _resp(json_data=None, content=None):
+        r = MagicMock(); r.raise_for_status = MagicMock()
+        if json_data is not None: r.json.return_value = json_data
+        if content is not None: r.content = content
+        return r
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = _resp(json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}})
+    client.put.return_value = _resp()
+    empty = _resp(json_data={"data": {"extract_result": []}})            # queued: empty list
+    done = _resp(json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}})
+    zipr = _resp(content=zip_bytes)
+    def _get(url, *a, **k):
+        if url == "https://zip": return zipr
+        _get.n += 1
+        return empty if _get.n == 1 else done
+    _get.n = 0
+    client.get.side_effect = _get
+    httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src)
+    assert "Ok" in result.markdown   # survived the empty-list poll without crashing
+
+
+def test_timeout_invalid_is_clamped():
+    """Zero or non-numeric ``timeout`` values fall back to 600; a valid value is kept."""
+    from openkb.parsers.mineru import MineruParser
+    cases = {0: 600, "x": 600, 30: 30}
+    assert all(MineruParser({"timeout": given}).timeout == want for given, want in cases.items())
+
+
+def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path):
+    """A reply whose JSON envelope carries an error code must raise, citing the code or message."""
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    r = MagicMock(); r.raise_for_status = MagicMock()
+    r.json.return_value = {"code": -10001, "msg": "token expired", "data": None}
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client)  # ``with Client() as c`` yields the stub
+    client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = r
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        MineruParser({"mode": "cloud"}).parse(src)
+    assert "token expired" in str(exc.value) or "-10001" in str(exc.value)
+
+
+def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path):
+    """A successful envelope with an empty ``file_urls`` list must raise, mentioning the upload URL."""
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    r = MagicMock(); r.raise_for_status = MagicMock()
+    r.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}}
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client)  # ``with Client() as c`` yields the stub
+    client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = r
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        MineruParser({"mode": "cloud"}).parse(src)
+    assert "upload URL" in str(exc.value)
+
+
+def test_full_md_basename_preferred_over_endswith(tmp_path):
+    """The entry whose basename is exactly ``full.md`` wins over one that merely ends with it."""
+    from openkb.parsers.mineru import _result_from_zip
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("not_full.md", "# WRONG")    # really ends with 'full.md' and is listed first
+        zf.writestr("full.md", "# RIGHT")
+    result = _result_from_zip(buf.getvalue())
+    assert "RIGHT" in result.markdown
+    assert "WRONG" not in result.markdown
+
+
+def test_image_basename_collision_warns(tmp_path, caplog):
+    import logging as _logging  # io and zipfile come from the module-level imports
+    from openkb.parsers.mineru import _result_from_zip
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# x")
+        zf.writestr("images/fig.png", b"A")
+        zf.writestr("sub/fig.png", b"B")        # same basename from a different directory
+    with caplog.at_level(_logging.WARNING):
+        _result_from_zip(buf.getvalue())        # return value unused; only the log is checked
+    assert any("fig.png" in r.message for r in caplog.records)
diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py
new file mode 100644
index 00000000..8283102d
--- /dev/null
+++ b/tests/test_parsers_mistral.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import base64
+import sys
+import types
+from unittest.mock import MagicMock
+
+import pytest
+
+from openkb.parsers.base import ParseResult
+
+
+def _install_fake_mistralai(monkeypatch, client_instance):
+    stub = types.ModuleType("mistralai")  # stands in for the real SDK package
+    stub.Mistral = MagicMock(return_value=client_instance)  # Mistral(...) hands back the given client
+    monkeypatch.setitem(sys.modules, "mistralai", stub)
+    return stub
+
+
+def test_supports_pdf():
+    """The Mistral parser accepts ``.pdf`` and rejects ``.docx``."""
+    from openkb.parsers.mistral import MistralParser
+    supports = MistralParser({}).supports
+    assert supports(".pdf") is True and supports(".docx") is False
+
+
+def test_missing_key_raises_actionable(monkeypatch, tmp_path):
+    monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
+    from openkb.parsers.mistral import MistralParser
+    parser = MistralParser({})
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):  # error must name the variable
+        parser.parse(pdf_path)
+
+
+def test_parse_collects_markdown_and_decodes_images(monkeypatch, tmp_path):
+    """The page markdown is returned and its data-URI image is decoded to raw bytes."""
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    raw_image = b"IMGDATA"
+    data_uri = "data:image/png;base64," + base64.b64encode(raw_image).decode()
+
+    ocr_image = MagicMock(id="img-0.png", image_base64=data_uri)
+    ocr_page = MagicMock(markdown="Text ![img-0.png](img-0.png)", images=[ocr_image])
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    client.ocr.process.return_value = MagicMock(pages=[ocr_page])
+
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    parsed = MistralParser({}).parse(pdf_path)
+
+    assert isinstance(parsed, ParseResult)
+    assert "img-0.png" in parsed.markdown
+    assert parsed.images["img-0.png"] == raw_image
+
+
+def test_missing_package_raises_install_hint(monkeypatch, tmp_path):
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    monkeypatch.setitem(sys.modules, "mistralai", None)  # a None entry makes the import raise
+    from openkb.parsers.mistral import MistralParser
+    parser = MistralParser({})
+    (tmp_path / "d.pdf").write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as caught:
+        parser.parse(tmp_path / "d.pdf")
+    assert "openkb[mistral]" in str(caught.value)  # plain substring: brackets are regex syntax
+
+
+def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog):
+    import logging as _logging
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    broken_image = MagicMock(id="bad.png", image_base64="!!!not-base64!!!")
+    ocr_page = MagicMock(markdown="Text ![bad.png](bad.png)", images=[broken_image])
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    client.ocr.process.return_value = MagicMock(pages=[ocr_page])
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with caplog.at_level(_logging.WARNING):
+        parsed = MistralParser({}).parse(pdf_path)
+    assert "bad.png" not in parsed.images  # the undecodable payload is left out
+    assert any("bad.png" in record.message for record in caplog.records)
+
+
+def test_uploaded_file_is_deleted(monkeypatch, tmp_path):
+    """After a successful OCR run the uploaded file is deleted by its id."""
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    client.ocr.process.return_value = MagicMock(pages=[])
+    # Reuse the module helper instead of re-building the stub package inline.
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    MistralParser({}).parse(pdf_path)
+    client.files.delete.assert_called_once_with(file_id="file-1")
+
+
+def test_uploaded_file_deleted_even_on_ocr_error(monkeypatch, tmp_path):
+    """The uploaded file is still deleted when the OCR call itself raises."""
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-2")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    # Make the OCR step fail after the upload has already succeeded.
+    client.ocr.process.side_effect = RuntimeError("ocr boom")
+    # Reuse the module helper instead of re-building the stub package inline.
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    pdf_path = tmp_path / "d.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError):
+        MistralParser({}).parse(pdf_path)
+    client.files.delete.assert_called_once_with(file_id="file-2")
diff --git a/tests/test_parsers_registry.py b/tests/test_parsers_registry.py
new file mode 100644
index 00000000..612ed259
--- /dev/null
+++ b/tests/test_parsers_registry.py
@@ -0,0 +1,40 @@
+"""Tests for parser selection / registry."""
+from __future__ import annotations
+
+import pytest
+
+from openkb.parsers.registry import get_parser
+from openkb.parsers.local import LocalParser
+
+
+def _kwargs():
+    return dict(doc_name="d", images_dir=None, source_dir=None)  # keyword args every call shares
+
+
+def test_default_is_local():
+    """With no ``parser`` key in the config, the registry returns a ``LocalParser``."""
+    assert isinstance(get_parser({}, **_kwargs()), LocalParser)
+
+
+def test_explicit_local():
+    """``parser: local`` in the config yields a ``LocalParser``."""
+    assert isinstance(get_parser({"parser": "local"}, **_kwargs()), LocalParser)
+
+
+def test_override_wins_over_config():
+    """An explicit ``override`` takes precedence over the configured parser name."""
+    assert isinstance(get_parser({"parser": "mistral"}, override="local", **_kwargs()), LocalParser)
+
+
+def test_unknown_name_raises_with_valid_options():
+    """An unknown parser name raises ``ValueError`` mentioning the bad name and ``local``."""
+    with pytest.raises(ValueError) as caught:
+        get_parser({"parser": "nope"}, **_kwargs())
+    assert all(fragment in str(caught.value) for fragment in ("nope", "local"))
+
+
+def test_valid_parsers_matches_dispatch():
+    """``VALID_PARSERS`` is exactly ``local`` plus the online factory keys, with ``local`` first."""
+    from openkb.parsers.registry import VALID_PARSERS, _ONLINE_PARSERS
+    assert set(VALID_PARSERS) == {"local"}.union(_ONLINE_PARSERS)
+    assert VALID_PARSERS[0] == "local"
diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py
new file mode 100644
index 00000000..b5a99400
--- /dev/null
+++ b/tests/test_parsers_vlm.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from unittest.mock import patch
+
+from openkb.parsers.vlm import VLMParser
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_pdf_only_for_v1():
+    """``supports`` is True for ``.pdf`` and False for ``.md`` and ``.docx``."""
+    parser = VLMParser({}, model="gemini/gemini-2.5-pro")
+    assert parser.supports(".pdf") is True
+    assert all(parser.supports(ext) is False for ext in (".md", ".docx"))
+
+
+def test_parse_calls_transcribe_with_configured_model(tmp_path):
+    """The ``model`` from the parser config is the one handed to the transcriber."""
+    pdf_path = tmp_path / "doc.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    parser = VLMParser({"model": "gpt-4o"}, model="fallback-model")
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# MD") as spy:
+        parsed = parser.parse(pdf_path)
+    spy.assert_called_once_with(pdf_path, model="gpt-4o")
+    assert isinstance(parsed, ParseResult) and parsed.markdown == "# MD"
+
+
+def test_parse_falls_back_to_global_model(tmp_path):
+    """With no ``model`` in the parser config, the ``model`` argument is used."""
+    pdf_path = tmp_path / "doc.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as spy:
+        VLMParser({}, model="global-model").parse(pdf_path)
+    spy.assert_called_once_with(pdf_path, model="global-model")
+
+
+def test_warns_when_falling_back_to_global_model(caplog):
+    """Constructing without a parser-config model logs ``parsers.vlm.model``."""
+    with caplog.at_level("WARNING"):
+        VLMParser({}, model="gpt-5.4-mini")
+    assert any("parsers.vlm.model" in rec.message for rec in caplog.records)
+
+
+def test_no_warning_when_vlm_model_set(caplog):
+    """No ``parsers.vlm.model`` record is logged when the config sets a model."""
+    with caplog.at_level("WARNING"):
+        VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini")
+    assert all("parsers.vlm.model" not in rec.message for rec in caplog.records)
+
+
+def test_parse_warns_text_only(tmp_path, caplog):
+    """``parse`` logs a record whose message contains ``text only``."""
+    src = tmp_path / "d.pdf"
+    src.write_bytes(b"%PDF")
+    p = VLMParser({"model": "gemini/gemini-2.5-pro"})
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# md"):
+        with caplog.at_level("WARNING"):  # level name avoids a function-local logging import
+            p.parse(src)
+    assert any("text only" in r.message for r in caplog.records)
diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py
new file mode 100644
index 00000000..3703c179
--- /dev/null
+++ b/tests/test_parsers_vlm_client.py
@@ -0,0 +1,61 @@
+"""Tests for the reusable litellm vision client."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+from openkb.parsers.vlm_client import transcribe_to_markdown
+
+
+def _fake_response(text):
+    """Build a mock response whose single choice has ``message.content == text``."""
+    only_choice = MagicMock(message=MagicMock(content=text))
+    return MagicMock(choices=[only_choice])
+
+
+def test_transcribe_pdf_sends_data_uri_and_returns_content(tmp_path):
+    """The mocked reply text is returned and the request content has ``base64``."""
+    pdf_path = tmp_path / "doc.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 data")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("# Parsed")) as completion:
+        markdown = transcribe_to_markdown(pdf_path, model="gemini/gemini-2.5-pro")
+    assert markdown == "# Parsed"
+    sent = completion.call_args.kwargs
+    assert sent["model"] == "gemini/gemini-2.5-pro"
+    assert any("base64" in str(part) for part in sent["messages"][0]["content"])
+
+
+def test_default_model_used_when_none(tmp_path):
+    """With ``model=None`` the completion call still gets a truthy ``model``."""
+    image_path = tmp_path / "img.png"
+    image_path.write_bytes(b"PNG")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("desc")) as completion:
+        transcribe_to_markdown(image_path, model=None)
+    assert completion.call_args.kwargs["model"]
+
+
+def test_pdf_uses_file_content_part(tmp_path):
+    """A PDF is sent as exactly one ``file`` part (PDF data URI), no ``image_url``."""
+    pdf_path = tmp_path / "doc.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 data")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("x")) as completion:
+        transcribe_to_markdown(pdf_path, model="some/model")
+    parts = completion.call_args.kwargs["messages"][0]["content"]
+    files = [part["file"] for part in parts if part.get("type") == "file"]
+    assert len(files) == 1 and all(part.get("type") != "image_url" for part in parts)
+    assert files[0]["file_data"].startswith("data:application/pdf;base64,")
+
+
+def test_image_uses_image_url_content_part(tmp_path):
+    """A PNG is sent as exactly one ``image_url`` part (PNG data URI), no ``file``."""
+    png_path = tmp_path / "fig.png"
+    png_path.write_bytes(b"\x89PNG\r\n")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("x")) as completion:
+        transcribe_to_markdown(png_path, model="some/model")
+    parts = completion.call_args.kwargs["messages"][0]["content"]
+    images = [part["image_url"] for part in parts if part.get("type") == "image_url"]
+    assert len(images) == 1 and all(part.get("type") != "file" for part in parts)
+    assert images[0]["url"].startswith("data:image/png;base64,")