From 20fb553328476883f4ce5522835150480531bb47 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 30 Jan 2026 11:30:57 +0000 Subject: [PATCH] feat: optimize example test discovery and execution speed - Add comment-based pytest markers for examples (cleaner than decorators) - Enable qualitative tests by default, add 'slow' marker for >5min tests - Improve test infrastructure with better skip logic and collection - Fix test failures: watsonx credentials, pytest imports, heavy RAM markers - Fix mypy errors in tools.py from upstream changes Resolves test discovery performance issues and improves CI reliability. --- AGENTS.md | 39 +- README.md | 22 +- docs/examples/aLora/101_example.py | 2 + docs/examples/agents/react.py | 2 + docs/examples/agents/react_instruct.py | 2 + docs/examples/conftest.py | 334 +++++++++++++++++- .../context/contexts_with_sampling.py | 2 + .../generative_slots/generate_with_context.py | 2 + .../generative_slots/generative_gsm8k.py | 2 + .../generative_slots/generative_slots.py | 2 + .../generative_slots_with_requirements.py | 2 + .../decision_aides.py | 2 + .../summarize_and_decide.py | 2 + .../inter_module_composition/summarizers.py | 2 + .../generative_slots/investment_advice.py | 2 + docs/examples/hello_world.py | 2 + docs/examples/helper/helpers.py | 2 + .../vision_litellm_backend.py | 2 + .../image_text_models/vision_ollama_chat.py | 2 + .../vision_openai_examples.py | 2 + .../101_with_gen_slots.py | 2 + .../advanced_with_m_instruct.py | 2 + .../instruct_validate_repair/101_email.py | 2 + .../101_email_comparison.py | 2 + .../101_email_with_requirements.py | 2 + .../101_email_with_validate.py | 2 + .../advanced_email_with_validate_function.py | 2 + docs/examples/intrinsics/answer_relevance.py | 2 + docs/examples/intrinsics/answerability.py | 2 + docs/examples/intrinsics/citations.py | 2 + docs/examples/intrinsics/context_relevance.py | 2 + .../intrinsics/hallucination_detection.py | 2 + docs/examples/intrinsics/intrinsics.py | 2 + docs/examples/intrinsics/query_rewrite.py | 2 + .../library_interop/langchain_messages.py | 2 + .../m_serve/m_serve_example_simple.py | 2 + docs/examples/melp/lazy.py | 2 + docs/examples/melp/lazy_fib.py | 2 + docs/examples/melp/lazy_fib_sample.py | 2 + docs/examples/melp/simple_example.py | 2 + docs/examples/melp/states.py | 2 + docs/examples/mify/mify.py | 2 + docs/examples/mify/rich_document_advanced.py | 2 + .../examples/mify/rich_table_execute_basic.py | 2 + docs/examples/mini_researcher/context_docs.py | 2 + docs/examples/mini_researcher/researcher.py | 2 + docs/examples/mobject/table.py | 2 + docs/examples/rag/mellea_pdf.py | 2 + docs/examples/safety/guardian.py | 2 + docs/examples/safety/guardian_huggingface.py | 2 + docs/examples/safety/repair_with_guardian.py | 2 + .../creating_a_new_type_of_session.py | 2 + docs/examples/sofai/sofai_graph_coloring.py | 2 + docs/examples/tools/interpreter_example.py | 2 + .../compositionality_with_generative_slots.py | 2 + docs/examples/tutorial/context_example.py | 4 +- docs/examples/tutorial/document_mobject.py | 2 + docs/examples/tutorial/example.py | 2 + .../tutorial/instruct_validate_repair.py | 2 + .../tutorial/model_options_example.py | 2 + .../examples/tutorial/sentiment_classifier.py | 2 + docs/examples/tutorial/simple_email.py | 2 + docs/examples/tutorial/table_mobject.py | 2 + docs/tutorial.md | 16 +- mellea/backends/tools.py | 6 +- mellea/backends/watsonx.py | 2 +- pyproject.toml | 8 + test/MARKERS_GUIDE.md | 23 +- test/backends/test_vision_openai.py | 1 + 
 test/stdlib/components/intrinsic/test_rag.py | 1 +
 test/stdlib/test_session.py | 2 +
 uv.lock | 16 +-
 72 files changed, 569 insertions(+), 23 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 6cb66905..60d03a21 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -11,8 +11,10 @@ AGENTS.md — Instructions for AI coding assistants (Claude, Cursor, Copilot, Co
 pre-commit install # Required: install git hooks
 uv sync --all-extras --all-groups # Install all deps (required for tests)
 ollama serve # Start Ollama (required for most tests)
-uv run pytest -m "not qualitative" # Skips LLM quality tests (~2 min)
-uv run pytest # Full suite (includes LLM quality tests)
+uv run pytest # Default: qualitative tests, skip slow tests
+uv run pytest -m "not qualitative and not slow" # Fast tests only (~2 min)
+uv run pytest -m slow # Run only slow tests (>5 min)
+uv run pytest -m "" # Run ALL tests including slow (overrides the default marker filter)
 uv run ruff format . && uv run ruff check . # Lint & format
 ```
 **Branches**: `feat/topic`, `fix/issue-id`, `docs/topic`
@@ -28,10 +30,41 @@ uv run ruff format . && uv run ruff check . # Lint & format
 | `scratchpad/` | Experiments (git-ignored) |
 
 ## 3. Test Markers
+All tests and examples use markers to indicate requirements. The test infrastructure automatically skips tests based on system capabilities.
+
+**Backend Markers:**
+- `@pytest.mark.ollama` — Requires Ollama running (local, lightweight)
+- `@pytest.mark.huggingface` — Requires HuggingFace backend (local, heavy)
+- `@pytest.mark.vllm` — Requires vLLM backend (local, GPU required)
+- `@pytest.mark.openai` — Requires OpenAI API (requires API key)
+- `@pytest.mark.watsonx` — Requires Watsonx API (requires API key)
+- `@pytest.mark.litellm` — Requires LiteLLM backend
+
+**Capability Markers:**
+- `@pytest.mark.requires_gpu` — Requires GPU
+- `@pytest.mark.requires_heavy_ram` — Requires 48GB+ RAM
+- `@pytest.mark.requires_api_key` — Requires external API keys
 - `@pytest.mark.qualitative` — LLM output quality tests (skipped in CI via `CICD=1`)
-- **Unmarked** — Unit tests (may still require Ollama running locally)
+- `@pytest.mark.llm` — Makes LLM calls (needs at least Ollama)
+- `@pytest.mark.slow` — Tests taking >5 minutes (skipped via `SKIP_SLOW=1`)
+
+**Examples in `docs/examples/`** use comment-based markers for clean code:
+```python
+# pytest: ollama, llm, requires_heavy_ram
+"""Example description..."""
+
+# Your clean example code here
+```
+
+Tests/examples automatically skip if the system lacks required resources. Heavy examples (e.g., HuggingFace) are skipped during collection to prevent memory issues.
+
+**Default behavior:**
+- `uv run pytest` skips slow tests (>5 min) but runs qualitative tests
+- Use `pytest -m "not qualitative and not slow"` for fast tests only (~2 min)
+- Use `pytest -m slow` to run only slow tests, or `pytest -m ""` to run everything including slow tests
 
 ⚠️ Don't add `qualitative` to trivial tests—keep the fast loop fast.
+⚠️ Mark tests taking >5 minutes with `slow` (e.g., dataset loading, extensive evaluations).
 
 ## 4. Coding Standards
 - **Types required** on all core functions
diff --git a/README.md b/README.md
index e1093e71..3a2305cc 100644
--- a/README.md
+++ b/README.md
@@ -264,9 +264,25 @@ uv pip install -e ".[all]" --group dev
 pre-commit install
 ```
 
-You can then run all tests by running `pytest`, or only the CI/CD tests by
-running `CICD=1 pytest`. See [test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md) for
-details on running specific test categories (e.g., by backend, resource requirements).
+You can then run tests:
+```bash
+# Default: qualitative tests, skip slow tests
+uv run pytest
+
+# Fast tests only (no qualitative, no slow)
+uv run pytest -m "not qualitative and not slow"
+
+# Run only slow tests
+uv run pytest -m slow
+
+# Run ALL tests including slow (overrides the default marker filter)
+uv run pytest -m ""
+
+# CI/CD mode (skips qualitative tests)
+CICD=1 uv run pytest
+```
+
+See [test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md) for details on running specific test categories (e.g., by backend, resource requirements).
 
 Tip: you can bypass the hooks by passing the `-n` flag to `git commit`.
 This is sometimes helpful for intermediate commits that you intend to later
diff --git a/docs/examples/aLora/101_example.py b/docs/examples/aLora/101_example.py
index 1b65509f..9497b9b9 100644
--- a/docs/examples/aLora/101_example.py
+++ b/docs/examples/aLora/101_example.py
@@ -1,3 +1,5 @@
+# pytest: huggingface, requires_heavy_ram, llm
+
 import time
 
 from mellea import MelleaSession
diff --git a/docs/examples/agents/react.py b/docs/examples/agents/react.py
index 117f1440..1debc044 100644
--- a/docs/examples/agents/react.py
+++ b/docs/examples/agents/react.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 import datetime
 import inspect
 import json
diff --git a/docs/examples/agents/react_instruct.py b/docs/examples/agents/react_instruct.py
index b72adbc6..47dfa2b9 100644
--- a/docs/examples/agents/react_instruct.py
+++ b/docs/examples/agents/react_instruct.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 import datetime
 import inspect
 import json
diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py
index bef7dce6..28658770 100644
--- a/docs/examples/conftest.py
+++ b/docs/examples/conftest.py
@@ -3,22 +3,176 @@
 To run notebooks, use: uv run --with 'mcp' pytest --nbmake docs/examples/notebooks/
 """
 
+import ast
+import os
 import pathlib
 import subprocess
 import sys
 
 import pytest
 
+# Lazy import of system capability detection to avoid circular imports
+_get_system_capabilities = None
+
+
+def get_system_capabilities():
+    """Lazy load system capabilities from test/conftest.py."""
+    global _get_system_capabilities
+
+    if _get_system_capabilities is not None:
+        return _get_system_capabilities()
+
+    # Add test directory to path to enable import
+    _test_dir = pathlib.Path(__file__).parent.parent.parent / "test"
+    _test_dir_abs = _test_dir.resolve()
+    if str(_test_dir_abs) not in sys.path:
+        sys.path.insert(0, str(_test_dir_abs))
+
+    try:
+        # Import with explicit module name to avoid conflicts
+        import importlib.util
+
+        spec = importlib.util.spec_from_file_location(
+            "test_conftest", _test_dir_abs / "conftest.py"
+        )
+        if spec and spec.loader:
+            test_conftest = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(test_conftest)
+            _get_system_capabilities = test_conftest.get_system_capabilities
+            return _get_system_capabilities()
+        else:
+            raise ImportError("Could not load test/conftest.py")
+    except (ImportError, AttributeError) as e:
+        # Fallback if test/conftest.py not available
+        import warnings
+
+        warnings.warn(
+            f"Could not import get_system_capabilities from test/conftest.py: {e}. Heavy RAM tests will NOT be skipped!"
+ ) + + def fallback(): + return { + "has_gpu": False, + "gpu_memory_gb": 0, + "ram_gb": 0, + "has_api_keys": {}, + "has_ollama": False, + } + + _get_system_capabilities = fallback + return fallback() + + examples_to_skip = { - "101_example.py", "__init__.py", "simple_rag_with_filter.py", "mcp_example.py", "client.py", "pii_serve.py", + "mellea_pdf.py", # External URL returns 403 Forbidden } +def _extract_markers_from_file(file_path): + """Extract pytest markers from comment in file without parsing Python. + + Looks for lines like: # pytest: marker1, marker2, marker3 + Returns list of marker names. + """ + try: + with open(file_path) as f: + for line in f: + line = line.strip() + if line.startswith("# pytest:"): + marker_text = line[9:].strip() # Remove "# pytest:" + return [m.strip() for m in marker_text.split(",") if m.strip()] + # Stop after first few lines (markers should be at top) + if ( + len(line) > 0 + and not line.startswith("#") + and not line.startswith('"""') + ): + break + except Exception: + pass + return [] + + +def _should_skip_collection(markers): + """Check if example should be skipped during collection based on markers. + + Returns (should_skip, reason) tuple. + """ + if not markers: + return False, None + + try: + capabilities = get_system_capabilities() + except Exception: + # If we can't get capabilities, don't skip (fail open) + return False, None + + gh_run = int(os.environ.get("CICD", 0)) + + # Skip qualitative tests in CI + if "qualitative" in markers and gh_run == 1: + return True, "Skipping qualitative test in CI (CICD=1)" + + # Skip slow tests if SKIP_SLOW=1 environment variable is set + if "slow" in markers and int(os.environ.get("SKIP_SLOW", 0)) == 1: + return True, "Skipping slow test (SKIP_SLOW=1)" + + # Skip tests requiring heavy RAM if insufficient + if "requires_heavy_ram" in markers: + RAM_THRESHOLD_GB = 48 + if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB: + return ( + True, + f"Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)", + ) + + # Skip tests requiring GPU if not available + if "requires_gpu" in markers or "vllm" in markers: + if not capabilities["has_gpu"]: + return True, "GPU not available" + + # Skip tests requiring Ollama if not available + if "ollama" in markers: + if not capabilities["has_ollama"]: + return True, "Ollama not available (port 11434 not listening)" + + # Skip tests requiring API keys + if "requires_api_key" in markers or "watsonx" in markers: + if "watsonx" in markers and not capabilities["has_api_keys"].get("watsonx"): + return True, "Watsonx API credentials not found" + if "openai" in markers and not capabilities["has_api_keys"].get("openai"): + return True, "OpenAI API key not found" + + return False, None + + +def _check_optional_imports(file_path): + """Check if file has optional imports that aren't installed. + + Returns (should_skip, reason) tuple. + """ + try: + with open(file_path) as f: + content = f.read() + + # Check for langchain imports + if "from langchain" in content or "import langchain" in content: + try: + import langchain_core + except ImportError: + return True, "langchain_core not installed" + + except Exception: + pass + + return False, None + + def pytest_terminal_summary(terminalreporter, exitstatus, config): # Append the skipped examples if needed. 
if len(examples_to_skip) == 0: @@ -26,11 +180,69 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): terminalreporter.ensure_newline() terminalreporter.section("Skipped Examples", sep="=", blue=True, bold=True) + newline = "\n" terminalreporter.line( - f"Examples with the following names were skipped because they cannot be easily run in the pytest framework; please run them manually:\n{'\n'.join(examples_to_skip)}" + f"Examples with the following names were skipped because they cannot be easily run in the pytest framework; please run them manually:\n{newline.join(examples_to_skip)}" ) +def pytest_ignore_collect(collection_path, path, config): + """Ignore files before pytest even tries to parse them. + + This is called BEFORE pytest_collect_file, so we can prevent + heavy files from being parsed at all. + """ + # Skip conftest.py itself - it's not a test + if collection_path.name == "conftest.py": + return True + + # Only check Python files in docs/examples + if ( + collection_path.suffix == ".py" + and "docs" in collection_path.parts + and "examples" in collection_path.parts + ): + # Skip files in the manual skip list + if collection_path.name in examples_to_skip: + return True + + # Extract markers and check if we should skip + try: + markers = _extract_markers_from_file(collection_path) + should_skip, reason = _should_skip_collection(markers) + if should_skip: + # Return True to ignore this file completely + return True + except Exception: + # If anything goes wrong, don't skip + pass + + return False + + +def pytest_pycollect_makemodule(module_path, path, parent): + """Prevent pytest from importing Python modules as test modules. + + This hook is called BEFORE pytest tries to import the module, + so we can prevent import errors from optional dependencies. + """ + # Only handle files in docs/examples + if ( + module_path.suffix == ".py" + and "docs" in module_path.parts + and "examples" in module_path.parts + ): + # Check for optional imports + should_skip, reason = _check_optional_imports(module_path) + if should_skip: + # Add to skip list and return None to prevent module creation + examples_to_skip.add(module_path.name) + return None + + # Return None to let pytest handle it normally + return None + + # This doesn't replace the existing pytest file collection behavior. def pytest_collect_file(parent: pytest.Dir, file_path: pathlib.PosixPath): # Do a quick check that it's a .py file in the expected `docs/examples` folder. 
We can make @@ -44,6 +256,11 @@ def pytest_collect_file(parent: pytest.Dir, file_path: pathlib.PosixPath): if file_path.name in examples_to_skip: return + # Check for optional imports before creating ExampleFile + should_skip, reason = _check_optional_imports(file_path) + if should_skip: + return None + return ExampleFile.from_parent(parent, path=file_path) @@ -82,9 +299,19 @@ def runtest(self): stderr = process.stderr.read() if retcode != 0: - raise ExampleTestException( - (f"Example failed with exit code {retcode}.\nStderr: {stderr}\n") - ) + # Check if this is a pytest.skip() call (indicated by "Skipped:" in stderr) + if "Skipped:" in stderr or "_pytest.outcomes.Skipped" in stderr: + # Extract skip reason from stderr + skip_reason = "Example skipped" + for line in stderr.split("\n"): + if line.startswith("Skipped:"): + skip_reason = line.replace("Skipped:", "").strip() + break + pytest.skip(skip_reason) + else: + raise ExampleTestException( + f"Example failed with exit code {retcode}.\nStderr: {stderr}\n" + ) def repr_failure(self, excinfo, style=None): """Called when self.runtest() raises an exception.""" @@ -99,3 +326,100 @@ def reportinfo(self): class ExampleTestException(Exception): """Custom exception for error reporting.""" + + +def pytest_runtest_setup(item): + """Apply skip logic to ExampleItem objects based on system capabilities. + + This ensures examples respect the same capability checks as regular tests + (RAM, GPU, Ollama, API keys, etc.). + """ + if not isinstance(item, ExampleItem): + return + + # Get system capabilities + capabilities = get_system_capabilities() + + # Get gh_run status (CI environment) + gh_run = int(os.environ.get("CICD", 0)) + + # Get config options (all default to False for examples) + ignore_all = False + ignore_gpu = False + ignore_ram = False + ignore_ollama = False + ignore_api_key = False + + # Skip qualitative tests in CI + if item.get_closest_marker("qualitative") and gh_run == 1: + pytest.skip( + reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." 
+ ) + + # Skip tests requiring API keys if not available + if item.get_closest_marker("requires_api_key") and not ignore_api_key: + for backend in ["openai", "watsonx"]: + if item.get_closest_marker(backend): + if not capabilities["has_api_keys"].get(backend): + pytest.skip( + f"Skipping test: {backend} API key not found in environment" + ) + + # Skip tests requiring GPU if not available + if item.get_closest_marker("requires_gpu") and not ignore_gpu: + if not capabilities["has_gpu"]: + pytest.skip("Skipping test: GPU not available") + + # Skip tests requiring heavy RAM if insufficient + if item.get_closest_marker("requires_heavy_ram") and not ignore_ram: + RAM_THRESHOLD_GB = 48 # Based on real-world testing + if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB: + pytest.skip( + f"Skipping test: Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)" + ) + + # Backend-specific skipping + if item.get_closest_marker("watsonx") and not ignore_api_key: + if not capabilities["has_api_keys"].get("watsonx"): + pytest.skip( + "Skipping test: Watsonx API credentials not found in environment" + ) + + if item.get_closest_marker("vllm") and not ignore_gpu: + if not capabilities["has_gpu"]: + pytest.skip("Skipping test: vLLM requires GPU") + + if item.get_closest_marker("ollama") and not ignore_ollama: + if not capabilities["has_ollama"]: + pytest.skip( + "Skipping test: Ollama not available (port 11434 not listening)" + ) + + +def pytest_collection_modifyitems(items): + """Apply markers from example files to ExampleItem objects. + + Parses comment-based markers from example files in the format: + # pytest: marker1, marker2, marker3 + + This keeps examples clean while allowing intelligent test skipping. + """ + for item in items: + if isinstance(item, ExampleItem): + # Read the file and look for comment-based markers + try: + with open(item.path) as f: + for line in f: + line = line.strip() + # Look for comment-based marker line + if line.startswith("# pytest:"): + # Extract markers after "# pytest:" + marker_text = line[9:].strip() # Remove "# pytest:" + markers = [m.strip() for m in marker_text.split(",")] + for marker_name in markers: + if marker_name: # Skip empty strings + item.add_marker(getattr(pytest.mark, marker_name)) + break # Only process first pytest comment line + except Exception: + # If we can't parse the file, skip marker application + pass diff --git a/docs/examples/context/contexts_with_sampling.py b/docs/examples/context/contexts_with_sampling.py index 1f71397b..35d69d45 100644 --- a/docs/examples/context/contexts_with_sampling.py +++ b/docs/examples/context/contexts_with_sampling.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea.stdlib.sampling import RejectionSamplingStrategy from mellea.stdlib.session import start_session diff --git a/docs/examples/generative_slots/generate_with_context.py b/docs/examples/generative_slots/generate_with_context.py index 8d4d3c87..1cc35262 100644 --- a/docs/examples/generative_slots/generate_with_context.py +++ b/docs/examples/generative_slots/generate_with_context.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea import generative, start_session from mellea.backends import ModelOption from mellea.core import CBlock diff --git a/docs/examples/generative_slots/generative_gsm8k.py b/docs/examples/generative_slots/generative_gsm8k.py index 4334e1d0..efaf1e0d 100644 --- a/docs/examples/generative_slots/generative_gsm8k.py +++ b/docs/examples/generative_slots/generative_gsm8k.py @@ -1,3 +1,5 @@ +# 
pytest: ollama, qualitative, llm, slow + """Example of chain-of-thought reasoning on a mathematical question from the GSM8K dataset, structured as code for improved performance with the granite-3.3-8B model. The original accuracy in standard "thinking" mode is approximately 80%, while this implementation achieves 85-89% accuracy—up to 9 points higher. This demonstrates that generative decorators are sufficient for complex reasoning tasks: not only do they maintain or improve performance, but they also significantly enhance observability and control. For instance, the structured Thought titles can be easily surfaced in a UI, providing instant insight into the model's reasoning process. diff --git a/docs/examples/generative_slots/generative_slots.py b/docs/examples/generative_slots/generative_slots.py index 2e1f5e40..c5d24f4e 100644 --- a/docs/examples/generative_slots/generative_slots.py +++ b/docs/examples/generative_slots/generative_slots.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from typing import Literal from mellea import generative, start_session diff --git a/docs/examples/generative_slots/generative_slots_with_requirements.py b/docs/examples/generative_slots/generative_slots_with_requirements.py index 6f5a610a..4f65c796 100644 --- a/docs/examples/generative_slots/generative_slots_with_requirements.py +++ b/docs/examples/generative_slots/generative_slots_with_requirements.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from typing import Literal from mellea import generative, start_session diff --git a/docs/examples/generative_slots/inter_module_composition/decision_aides.py b/docs/examples/generative_slots/inter_module_composition/decision_aides.py index 01e5d8f8..b5edc005 100644 --- a/docs/examples/generative_slots/inter_module_composition/decision_aides.py +++ b/docs/examples/generative_slots/inter_module_composition/decision_aides.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea import generative diff --git a/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py b/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py index 7184cf9f..3a5d91ad 100644 --- a/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py +++ b/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from typing import Literal from decision_aides import ( diff --git a/docs/examples/generative_slots/inter_module_composition/summarizers.py b/docs/examples/generative_slots/inter_module_composition/summarizers.py index a35285e7..01656db3 100644 --- a/docs/examples/generative_slots/inter_module_composition/summarizers.py +++ b/docs/examples/generative_slots/inter_module_composition/summarizers.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea import generative diff --git a/docs/examples/generative_slots/investment_advice.py b/docs/examples/generative_slots/investment_advice.py index c1768b8b..7685e197 100644 --- a/docs/examples/generative_slots/investment_advice.py +++ b/docs/examples/generative_slots/investment_advice.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from typing import Literal from mellea import MelleaSession, generative, start_session diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index 9374b73a..d00766d6 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import mellea m = mellea.start_session() diff --git a/docs/examples/helper/helpers.py 
b/docs/examples/helper/helpers.py index ad8a5a3d..726f0ec0 100644 --- a/docs/examples/helper/helpers.py +++ b/docs/examples/helper/helpers.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from textwrap import fill from typing import Any diff --git a/docs/examples/image_text_models/vision_litellm_backend.py b/docs/examples/image_text_models/vision_litellm_backend.py index 03f1ea1b..5ca65d93 100644 --- a/docs/examples/image_text_models/vision_litellm_backend.py +++ b/docs/examples/image_text_models/vision_litellm_backend.py @@ -1,3 +1,5 @@ +# pytest: litellm, llm + """Examples of using vision models with LiteLLM backend.""" import os diff --git a/docs/examples/image_text_models/vision_ollama_chat.py b/docs/examples/image_text_models/vision_ollama_chat.py index 49fb1198..b3b3fc04 100644 --- a/docs/examples/image_text_models/vision_ollama_chat.py +++ b/docs/examples/image_text_models/vision_ollama_chat.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + """Example of using Ollama with vision models with linear context.""" import pathlib diff --git a/docs/examples/image_text_models/vision_openai_examples.py b/docs/examples/image_text_models/vision_openai_examples.py index 1ca58658..d79d5fee 100644 --- a/docs/examples/image_text_models/vision_openai_examples.py +++ b/docs/examples/image_text_models/vision_openai_examples.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm, requires_heavy_ram + """Examples using vision models with OpenAI backend.""" import pathlib diff --git a/docs/examples/information_extraction/101_with_gen_slots.py b/docs/examples/information_extraction/101_with_gen_slots.py index 961a5122..bec0557f 100644 --- a/docs/examples/information_extraction/101_with_gen_slots.py +++ b/docs/examples/information_extraction/101_with_gen_slots.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + """Simple Example of information extraction with Mellea using generative slots.""" from mellea import generative, start_session diff --git a/docs/examples/information_extraction/advanced_with_m_instruct.py b/docs/examples/information_extraction/advanced_with_m_instruct.py index d2678952..60537c06 100644 --- a/docs/examples/information_extraction/advanced_with_m_instruct.py +++ b/docs/examples/information_extraction/advanced_with_m_instruct.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + """Advanced Example of information extraction with Mellea using m.instruct() and constraints.""" from collections.abc import Callable diff --git a/docs/examples/instruct_validate_repair/101_email.py b/docs/examples/instruct_validate_repair/101_email.py index eb0bb9a1..6eb905d1 100644 --- a/docs/examples/instruct_validate_repair/101_email.py +++ b/docs/examples/instruct_validate_repair/101_email.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + # This is the 101 example for using `session` and `instruct`. 
# helper function to wrap text from docs.examples.helper import w diff --git a/docs/examples/instruct_validate_repair/101_email_comparison.py b/docs/examples/instruct_validate_repair/101_email_comparison.py index 760cea76..e35e676e 100644 --- a/docs/examples/instruct_validate_repair/101_email_comparison.py +++ b/docs/examples/instruct_validate_repair/101_email_comparison.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from docs.examples.helper import w from mellea import start_session from mellea.backends import ModelOption diff --git a/docs/examples/instruct_validate_repair/101_email_with_requirements.py b/docs/examples/instruct_validate_repair/101_email_with_requirements.py index 5d9f21a8..21e72615 100644 --- a/docs/examples/instruct_validate_repair/101_email_with_requirements.py +++ b/docs/examples/instruct_validate_repair/101_email_with_requirements.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from docs.examples.helper import w from mellea import start_session from mellea.backends import ModelOption diff --git a/docs/examples/instruct_validate_repair/101_email_with_validate.py b/docs/examples/instruct_validate_repair/101_email_with_validate.py index a7a0e500..bf8589c2 100644 --- a/docs/examples/instruct_validate_repair/101_email_with_validate.py +++ b/docs/examples/instruct_validate_repair/101_email_with_validate.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from docs.examples.helper import req_print, w from mellea import start_session from mellea.backends.model_ids import IBM_GRANITE_3_3_8B diff --git a/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py b/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py index 0ffe0d13..4bdc9793 100644 --- a/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py +++ b/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from docs.examples.helper import w from mellea import start_session from mellea.backends import ModelOption diff --git a/docs/examples/intrinsics/answer_relevance.py b/docs/examples/intrinsics/answer_relevance.py index f945c6dd..6d10c048 100644 --- a/docs/examples/intrinsics/answer_relevance.py +++ b/docs/examples/intrinsics/answer_relevance.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the answer relevance intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/answerability.py b/docs/examples/intrinsics/answerability.py index 6804c5d7..c7f2e03d 100644 --- a/docs/examples/intrinsics/answerability.py +++ b/docs/examples/intrinsics/answerability.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the answerability intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/citations.py b/docs/examples/intrinsics/citations.py index 74377091..f7a0b6b8 100644 --- a/docs/examples/intrinsics/citations.py +++ b/docs/examples/intrinsics/citations.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the citations intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py index 470973e3..0f5c8eb3 100644 --- a/docs/examples/intrinsics/context_relevance.py +++ b/docs/examples/intrinsics/context_relevance.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the context relevance intrinsic for RAG applications. 
diff --git a/docs/examples/intrinsics/hallucination_detection.py b/docs/examples/intrinsics/hallucination_detection.py index 271e76a3..0adfe38e 100644 --- a/docs/examples/intrinsics/hallucination_detection.py +++ b/docs/examples/intrinsics/hallucination_detection.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the hallucination detection intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/intrinsics.py b/docs/examples/intrinsics/intrinsics.py index 10ba4e97..70a71671 100644 --- a/docs/examples/intrinsics/intrinsics.py +++ b/docs/examples/intrinsics/intrinsics.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + from mellea.backends.huggingface import LocalHFBackend from mellea.backends.openai import OpenAIBackend, _ServerType from mellea.backends.adapters.adapter import AdapterType, GraniteCommonAdapter diff --git a/docs/examples/intrinsics/query_rewrite.py b/docs/examples/intrinsics/query_rewrite.py index a95cadc7..7a6f3c56 100644 --- a/docs/examples/intrinsics/query_rewrite.py +++ b/docs/examples/intrinsics/query_rewrite.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ Example usage of the query rewrite intrinsic for RAG applications. diff --git a/docs/examples/library_interop/langchain_messages.py b/docs/examples/library_interop/langchain_messages.py index 8d99720d..4e0b02fb 100644 --- a/docs/examples/library_interop/langchain_messages.py +++ b/docs/examples/library_interop/langchain_messages.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + # Installing langchain is necessary for this example, but it works for any library # you may want to use Mellea with. from langchain_core.messages import HumanMessage, AIMessage, SystemMessage diff --git a/docs/examples/m_serve/m_serve_example_simple.py b/docs/examples/m_serve/m_serve_example_simple.py index f1dff480..2a717574 100644 --- a/docs/examples/m_serve/m_serve_example_simple.py +++ b/docs/examples/m_serve/m_serve_example_simple.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + """Example to run m serve.""" from typing import Any diff --git a/docs/examples/melp/lazy.py b/docs/examples/melp/lazy.py index 3a3b9e5f..4d293515 100644 --- a/docs/examples/melp/lazy.py +++ b/docs/examples/melp/lazy.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm + import asyncio from mellea.core import Context, CBlock, ModelOutputThunk diff --git a/docs/examples/melp/lazy_fib.py b/docs/examples/melp/lazy_fib.py index e91a4a2b..1383b410 100644 --- a/docs/examples/melp/lazy_fib.py +++ b/docs/examples/melp/lazy_fib.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import asyncio from mellea.core import Context, CBlock, ModelOutputThunk diff --git a/docs/examples/melp/lazy_fib_sample.py b/docs/examples/melp/lazy_fib_sample.py index 0224f4a3..1e1e5611 100644 --- a/docs/examples/melp/lazy_fib_sample.py +++ b/docs/examples/melp/lazy_fib_sample.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import asyncio from mellea.core import Context, CBlock, ModelOutputThunk diff --git a/docs/examples/melp/simple_example.py b/docs/examples/melp/simple_example.py index 7862027e..e1c38c67 100644 --- a/docs/examples/melp/simple_example.py +++ b/docs/examples/melp/simple_example.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import asyncio from mellea.core import Context, CBlock, ModelOutputThunk, Backend from mellea.backends.ollama import OllamaModelBackend diff --git a/docs/examples/melp/states.py b/docs/examples/melp/states.py index d8770c3a..efbd5e07 100644 --- a/docs/examples/melp/states.py +++ 
b/docs/examples/melp/states.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import asyncio from mellea.core import Context, CBlock, Backend diff --git a/docs/examples/mify/mify.py b/docs/examples/mify/mify.py index 3d6b81eb..0d4eab88 100644 --- a/docs/examples/mify/mify.py +++ b/docs/examples/mify/mify.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea.stdlib.components.docs.richdocument import TableQuery from mellea.stdlib.components.mify import MifiedProtocol, mify from mellea.stdlib.session import start_session diff --git a/docs/examples/mify/rich_document_advanced.py b/docs/examples/mify/rich_document_advanced.py index b186218f..1543fed9 100644 --- a/docs/examples/mify/rich_document_advanced.py +++ b/docs/examples/mify/rich_document_advanced.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + # ruff: noqa E402 # Example: Rich Documents and Templating # Lets look at how to pass documents to a model using `Mellea`. diff --git a/docs/examples/mify/rich_table_execute_basic.py b/docs/examples/mify/rich_table_execute_basic.py index a9a5c112..f1365a06 100644 --- a/docs/examples/mify/rich_table_execute_basic.py +++ b/docs/examples/mify/rich_table_execute_basic.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm + # This is an example of using the RichDocument class. import os diff --git a/docs/examples/mini_researcher/context_docs.py b/docs/examples/mini_researcher/context_docs.py index ce326e4e..e4d2d900 100644 --- a/docs/examples/mini_researcher/context_docs.py +++ b/docs/examples/mini_researcher/context_docs.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from dataclasses import dataclass from docs.examples.mini_researcher import RAGDocument diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py index 3fe3182a..2d092723 100644 --- a/docs/examples/mini_researcher/researcher.py +++ b/docs/examples/mini_researcher/researcher.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm + from collections.abc import Callable from functools import cache diff --git a/docs/examples/mobject/table.py b/docs/examples/mobject/table.py index 03788882..93234a06 100644 --- a/docs/examples/mobject/table.py +++ b/docs/examples/mobject/table.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from io import StringIO import pandas diff --git a/docs/examples/rag/mellea_pdf.py b/docs/examples/rag/mellea_pdf.py index 5f668e6c..b304839a 100644 --- a/docs/examples/rag/mellea_pdf.py +++ b/docs/examples/rag/mellea_pdf.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm, qualitative + import mellea from mellea.stdlib.components.docs.richdocument import RichDocument diff --git a/docs/examples/safety/guardian.py b/docs/examples/safety/guardian.py index c5b8b123..33913921 100644 --- a/docs/examples/safety/guardian.py +++ b/docs/examples/safety/guardian.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """Example of using the Enhanced Guardian Requirement with Granite Guardian 3.3 8B""" from mellea import MelleaSession diff --git a/docs/examples/safety/guardian_huggingface.py b/docs/examples/safety/guardian_huggingface.py index bbb84698..dceba917 100644 --- a/docs/examples/safety/guardian_huggingface.py +++ b/docs/examples/safety/guardian_huggingface.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """Example of using GuardianCheck with HuggingFace backend for direct model inference This example shows how to reuse the Guardian backend across multiple validators diff --git a/docs/examples/safety/repair_with_guardian.py 
b/docs/examples/safety/repair_with_guardian.py index 2355eff5..7c2a0c96 100644 --- a/docs/examples/safety/repair_with_guardian.py +++ b/docs/examples/safety/repair_with_guardian.py @@ -1,3 +1,5 @@ +# pytest: huggingface, requires_heavy_ram, llm + """ RepairTemplateStrategy Example with Actual Function Call Validation Demonstrates how RepairTemplateStrategy repairs responses using actual function calls. diff --git a/docs/examples/sessions/creating_a_new_type_of_session.py b/docs/examples/sessions/creating_a_new_type_of_session.py index 59624caf..85aec63c 100644 --- a/docs/examples/sessions/creating_a_new_type_of_session.py +++ b/docs/examples/sessions/creating_a_new_type_of_session.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm + from typing import Literal from PIL import Image as PILImage diff --git a/docs/examples/sofai/sofai_graph_coloring.py b/docs/examples/sofai/sofai_graph_coloring.py index 681732c6..134425c5 100644 --- a/docs/examples/sofai/sofai_graph_coloring.py +++ b/docs/examples/sofai/sofai_graph_coloring.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm, requires_heavy_ram + """SOFAI Sampling Strategy Example: Graph Coloring Problem. This example demonstrates the SOFAI (Slow and Fast AI) diff --git a/docs/examples/tools/interpreter_example.py b/docs/examples/tools/interpreter_example.py index b2a9315b..1e7e3ec8 100644 --- a/docs/examples/tools/interpreter_example.py +++ b/docs/examples/tools/interpreter_example.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea.stdlib.tools import code_interpreter, local_code_interpreter from mellea import start_session, MelleaSession from mellea.backends import ModelOption diff --git a/docs/examples/tutorial/compositionality_with_generative_slots.py b/docs/examples/tutorial/compositionality_with_generative_slots.py index 8ba09f1f..a8b7c9d5 100644 --- a/docs/examples/tutorial/compositionality_with_generative_slots.py +++ b/docs/examples/tutorial/compositionality_with_generative_slots.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + from mellea import generative diff --git a/docs/examples/tutorial/context_example.py b/docs/examples/tutorial/context_example.py index e98e1182..313011cb 100644 --- a/docs/examples/tutorial/context_example.py +++ b/docs/examples/tutorial/context_example.py @@ -1,5 +1,7 @@ +# pytest: ollama, llm + from mellea import start_session -from mellea.stdlib.base import ChatContext +from mellea.stdlib.context import ChatContext m = start_session(ctx=ChatContext()) m.chat("Make up a math problem.") diff --git a/docs/examples/tutorial/document_mobject.py b/docs/examples/tutorial/document_mobject.py index 42c18cb1..9be644f3 100644 --- a/docs/examples/tutorial/document_mobject.py +++ b/docs/examples/tutorial/document_mobject.py @@ -1,3 +1,5 @@ +# pytest: ollama, qualitative, llm, requires_heavy_ram + from mellea.backends import model_ids from mellea.backends.model_ids import IBM_GRANITE_3_3_8B from mellea.stdlib.components.docs.richdocument import RichDocument diff --git a/docs/examples/tutorial/example.py b/docs/examples/tutorial/example.py index 6212a9e8..eb8fdf21 100644 --- a/docs/examples/tutorial/example.py +++ b/docs/examples/tutorial/example.py @@ -1,3 +1,5 @@ +# pytest: ollama, llm + import mellea m = mellea.start_session() diff --git a/docs/examples/tutorial/instruct_validate_repair.py b/docs/examples/tutorial/instruct_validate_repair.py index 76113d0f..6295358b 100644 --- a/docs/examples/tutorial/instruct_validate_repair.py +++ b/docs/examples/tutorial/instruct_validate_repair.py @@ -1,3 +1,5 @@ +# pytest: 
ollama, llm
+
 from mellea.stdlib.requirements import check, req, simple_validate
 
 requirements = [
diff --git a/docs/examples/tutorial/model_options_example.py b/docs/examples/tutorial/model_options_example.py
index 7eb88b9a..2ee1ed2b 100644
--- a/docs/examples/tutorial/model_options_example.py
+++ b/docs/examples/tutorial/model_options_example.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 import mellea
 from mellea.backends import model_ids
 from mellea.backends.ollama import OllamaModelBackend
diff --git a/docs/examples/tutorial/sentiment_classifier.py b/docs/examples/tutorial/sentiment_classifier.py
index cd6ded25..d26269ed 100644
--- a/docs/examples/tutorial/sentiment_classifier.py
+++ b/docs/examples/tutorial/sentiment_classifier.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 from typing import Literal
 
 from mellea import generative, start_session
diff --git a/docs/examples/tutorial/simple_email.py b/docs/examples/tutorial/simple_email.py
index 0de7c772..9f71462c 100644
--- a/docs/examples/tutorial/simple_email.py
+++ b/docs/examples/tutorial/simple_email.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 import mellea
 
 # INFO: this line will download IBM's Granite 4 Micro 3B model.
diff --git a/docs/examples/tutorial/table_mobject.py b/docs/examples/tutorial/table_mobject.py
index 1fa0eb34..e4de110f 100644
--- a/docs/examples/tutorial/table_mobject.py
+++ b/docs/examples/tutorial/table_mobject.py
@@ -1,3 +1,5 @@
+# pytest: ollama, llm
+
 from io import StringIO
 
 import pandas
diff --git a/docs/tutorial.md b/docs/tutorial.md
index 6c8c4039..2fa97af3 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -1436,9 +1436,19 @@ uv pip install -e ".[all]" --group dev
 pre-commit install
 ```
 
-You can then run all tests by running `pytest`, or only the CI/CD tests by
-running `CICD=1 pytest`. See [test/MARKERS_GUIDE.md](../test/MARKERS_GUIDE.md) for
-details on running specific test categories (e.g., by backend, resource requirements).
+You can then run tests:
+```bash
+# Default: runs qualitative tests, skips slow tests (>5 min)
+uv run pytest
+
+# Full test suite (includes slow tests)
+uv run pytest -m ""
+
+# CI/CD mode (skips qualitative tests even with -m "")
+CICD=1 uv run pytest -m ""
+```
+
+See [test/MARKERS_GUIDE.md](../test/MARKERS_GUIDE.md) for details on running specific test categories (e.g., by backend, resource requirements).
 
 Tip: you can bypass the hooks by passing the `-n` flag to `git commit`.
This is sometimes helpful for intermediate commits that you intend to later diff --git a/mellea/backends/tools.py b/mellea/backends/tools.py index 32b56528..753ad048 100644 --- a/mellea/backends/tools.py +++ b/mellea/backends/tools.py @@ -47,8 +47,10 @@ def as_json_tool(self) -> dict[str, Any]: def from_langchain(cls, tool: Any): """Create a Tool from a langchain tool object.""" try: - from langchain_core.tools import BaseTool - from langchain_core.utils.function_calling import convert_to_openai_tool + from langchain_core.tools import BaseTool # type: ignore[import-not-found] + from langchain_core.utils.function_calling import ( # type: ignore[import-not-found] + convert_to_openai_tool, + ) if isinstance(tool, BaseTool): tool_name = tool.name diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 31093c02..0fe1c2bd 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -94,7 +94,7 @@ def __init__( self._model_id = model_id if base_url is None: - base_url = f"{os.environ.get('WATSONX_URL')}" + base_url = os.environ.get("WATSONX_URL") if api_key is None: api_key = os.environ.get("WATSONX_API_KEY") diff --git a/pyproject.toml b/pyproject.toml index a9c1ec0f..f53faffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,7 @@ dev = [ "pytest", "pytest-asyncio", "pytest-cov>=6.0.0", + "pytest-timeout", # For test suite timeout configuration "psutil", # For test infrastructure: RAM detection in conftest.py "mypy>=1.17.0", "python-semantic-release~=7.32", @@ -201,6 +202,7 @@ disable_error_code = ["empty-body", "import-untyped"] python_version = "3.10" [tool.pytest.ini_options] +testpaths = ["test", "docs"] # Run test/ first (fail fast), then docs/ markers = [ # Backend markers "ollama: Tests requiring Ollama backend (local, light)", @@ -215,17 +217,23 @@ markers = [ "requires_gpu: Tests requiring GPU", "requires_heavy_ram: Tests requiring 48GB+ RAM", "qualitative: Non-deterministic quality tests", + "slow: Tests taking >5 minutes (e.g., dataset loading)", # Composite markers "llm: Tests that make LLM calls (needs at least Ollama)", ] asyncio_mode = "auto" # Don't require explicitly marking async tests. 
 addopts = [
+    # Skip slow tests by default (pass -m "" to include them)
+    "-m", "not slow",
+    # Run qualitative tests by default (use -m "not qualitative and not slow" for fast tests)
     "--cov=mellea",
     "--cov=cli",
     "--cov-report=term",
     "--cov-report=html",
     "--cov-report=json",
+    # Set a 15-minute per-test timeout (pytest-timeout)
+    "--timeout=900",
 ]
 filterwarnings = [
     # Show each deprecation warning only once per session
diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md
index 522507a1..2e4df2ae 100644
--- a/test/MARKERS_GUIDE.md
+++ b/test/MARKERS_GUIDE.md
@@ -62,9 +62,18 @@ pytest --ignore-gpu-check --ignore-ram-check -m "huggingface"
 
 ## Quick Start
 
 ```bash
-# Run all tests (auto-skips based on your system)
+# Default: qualitative tests, skip slow tests
 pytest
+# Fast tests only (no qualitative, no slow)
+pytest -m "not qualitative and not slow"
+
+# Run only slow tests
+pytest -m "slow"
+
+# Run ALL tests including slow (overrides the default marker filter)
+pytest -m ""
+
 # Run only fast unit tests (no LLM calls)
 pytest -m "not llm"
 
@@ -74,13 +83,12 @@ pytest -m "ollama"
 # Run tests that don't require API keys
 pytest -m "not requires_api_key"
 
-# Run infrastructure tests only (skip quality tests)
-pytest -m "not qualitative"
-
 # Run quality tests for Ollama
 pytest -m "ollama and qualitative"
 ```
 
+**Note:** By default, `pytest` excludes slow tests (>5 min) but includes qualitative tests (configured in `pyproject.toml`). Use `pytest -m ""` to run all tests including slow ones.
+
 ## Marker Categories
 
 ### Backend Markers
@@ -140,8 +148,15 @@ Specify resource or authentication requirements:
 
 - **`@pytest.mark.qualitative`**: Non-deterministic quality tests
   - Tests LLM output quality rather than infrastructure
+  - **Included by default** (run with standard `pytest`)
   - Skipped in CI (when `CICD=1`)
   - May be flaky due to model variability
+  - Use `pytest -m "not qualitative"` to exclude these tests
+
+- **`@pytest.mark.slow`**: Tests taking >5 minutes
+  - Tests that load large datasets, run extensive evaluations, etc.
+  - **Excluded by default** (configured in `pyproject.toml` addopts)
+  - Use `pytest -m slow` or `pytest -m ""` to include these tests
 
 ### Composite Markers
 
diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py
index 6285faaf..b712f53c 100644
--- a/test/backends/test_vision_openai.py
+++ b/test/backends/test_vision_openai.py
@@ -126,6 +126,7 @@ def test_image_block_in_instruction(
     assert image_block.value[:100] in image_url["url"]
 
 
+@pytest.mark.qualitative
 def test_image_block_in_chat(
     m_session: MelleaSession, pil_image: Image.Image, gh_run: int
 ):
diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py
index 399af90b..633df557 100644
--- a/test/stdlib/components/intrinsic/test_rag.py
+++ b/test/stdlib/components/intrinsic/test_rag.py
@@ -21,6 +21,7 @@
     ),
     pytest.mark.huggingface,
     pytest.mark.requires_gpu,
+    pytest.mark.requires_heavy_ram,  # 3B model + document processing needs ~30-35GB
     pytest.mark.llm,
 ]
 
diff --git a/test/stdlib/test_session.py b/test/stdlib/test_session.py
index 2bace2cd..6fc07d1e 100644
--- a/test/stdlib/test_session.py
+++ b/test/stdlib/test_session.py
@@ -18,6 +18,8 @@ def m_session(gh_run):
     del m
 
 
+@pytest.mark.watsonx
+@pytest.mark.requires_api_key
 def test_start_session_watsonx(gh_run):
     if gh_run == 1:
         pytest.skip("Skipping watsonx tests.")
diff --git a/uv.lock b/uv.lock
index 7ef653bd..1d8fe940 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.14' and python_full_version < '4' and sys_platform == 'darwin'",
@@ -3457,6 +3457,7 @@ dev = [
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
+    { name = "pytest-timeout" },
     { name = "python-semantic-release" },
     { name = "ruff" },
 ]
@@ -3528,6 +3529,7 @@ dev = [
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov", specifier = ">=6.0.0" },
+    { name = "pytest-timeout" },
     { name = "python-semantic-release", specifier = "~=7.32" },
     { name = "ruff", specifier = ">=0.11.6" },
 ]
@@ -5813,6 +5815,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" },
 ]
 
+[[package]]
+name = "pytest-timeout"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" },
+]
+
 [[package]]
 name = "python-bidi"
 version = "0.6.7"