refactor: llguidance #288
base: main
Changes from all commits
21a57ef
b6c1eb3
35d9521
fbd37f1
f3b4591
20ae052
1987115
62923fe
7eee2f1
29588fb
```diff
@@ -19,8 +19,6 @@
 from typing import TYPE_CHECKING, Any, Optional, overload

 import msgspec  # type:ignore
-import outlines
-import outlines_core
 import torch
 import vllm  # type:ignore
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
```
```diff
@@ -51,8 +49,6 @@
 from mellea.stdlib.chat import Message
 from mellea.stdlib.requirement import LLMaJRequirement, Requirement

-assert outlines, "outlines needs to be present to make outlines_core work"
-
 format: None = None  # typing this variable in order to shadow the global format function and ensure mypy checks for errors
```
```diff
@@ -84,14 +80,6 @@ def __init__(
             formatter (Formatter): A mechanism for turning `stdlib` stuff into strings. Experimental Span-based models should use `mellea.backends.span.*` backends.
             model_options (Optional[dict]): Default model options.
         """
-        if os.environ.get("VLLM_USE_V1", -1) != "0":
-            FancyLogger.get_logger().error(
-                "Mellea LocalVLLMBackend doesn't support VLLM V1. Must `export VLLM_USE_V1=0`."
-            )
-            raise ValueError(
-                "Mellea LocalVLLMBackend doesn't support VLLM V1. Must `export VLLM_USE_V1=0`."
-            )
-
         formatter = (
             formatter if formatter is not None else TemplateFormatter(model_id=model_id)
         )
```
```diff
@@ -206,23 +194,20 @@ def __init__(
         # Keep track of the event loop the engine was instantiated in.
         self._event_loop = get_current_event_loop()
+        # we store the engine args because we have to reset the engine with a different event loop. See _model .
+        self.engine_args = engine_args

         self._tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
             self._hf_model_id
         )  # type:ignore

-        # See the notes in outlines.models.vllm.adapt_tokenizer for why this is needed.
-        # Note: there is a module named outlines.models.vllm and a function named outlines.models.vllm.vllm .
-        # However, outlines.models import outlines.models.vllm.vllm as vllm,
-        # thus the module outlines.models.vllm becomes inaccessible,
-        # hence the use of importlib to get the module.
-        self._tokenizer_for_outlines: PreTrainedTokenizerBase = importlib.import_module(
-            "outlines.models.vllm"
-        ).adapt_tokenizer(self._tokenizer)
-
     @property
     def _model(self) -> vllm.AsyncLLMEngine:
         """Use model when making generation requests."""
+        # 2026/01/06 Masa: Temporarily canceling the mechanism below.
+        # After vllm 0.11.0, start/shutdown_background_loop is gone.
+        # 2026/01/07 Masa: Rewrote it to reinstantiate the engine.

         el = get_current_event_loop()

         # vLLM attaches itself to the event loop that is running when instantiated /
```
```diff
@@ -232,8 +217,13 @@ def _model(self) -> vllm.AsyncLLMEngine:
         # Most of the time, this should be a no-op. The event loop will only change
         # if switching between async and sync calls.
         if el != self._event_loop:
-            self._underlying_model.shutdown_background_loop()
-            self._underlying_model.start_background_loop()
+            FancyLogger.get_logger().warning("restarting the vllm event loop")
+            # self._underlying_model.shutdown_background_loop()
+            # self._underlying_model.start_background_loop()
+            self._underlying_model.shutdown()
+            self._underlying_model = vllm.AsyncLLMEngine.from_engine_args(
+                vllm.AsyncEngineArgs(model=self._hf_model_id, **self.engine_args)
+            )
             self._event_loop = el

         return self._underlying_model
```
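The branch above no longer toggles a background loop (those APIs are gone after vLLM 0.11.0); it shuts the engine down and rebuilds it whenever the current asyncio event loop differs from the one it was created on. A condensed, hedged sketch of that pattern outside the backend class (helper names like `_current_loop` and `EngineHolder` are illustrative, not from the PR):

```python
# Sketch only: rebuild the AsyncLLMEngine when the running event loop changes,
# mirroring the diff above. Assumes a vLLM release providing
# AsyncLLMEngine.from_engine_args() and .shutdown(), as used in the diff.
import asyncio
import vllm


def _current_loop():
    # Illustrative stand-in for the backend's get_current_event_loop helper.
    try:
        return asyncio.get_running_loop()
    except RuntimeError:
        return None


class EngineHolder:
    def __init__(self, model_id: str, **engine_args):
        self._model_id = model_id
        self._engine_args = engine_args  # kept so the engine can be rebuilt later
        self._engine = vllm.AsyncLLMEngine.from_engine_args(
            vllm.AsyncEngineArgs(model=model_id, **engine_args)
        )
        self._loop = _current_loop()

    @property
    def engine(self) -> vllm.AsyncLLMEngine:
        loop = _current_loop()
        if loop is not self._loop:
            # The engine is bound to the loop it was created on, so tear it
            # down and reinstantiate it on the current loop.
            self._engine.shutdown()
            self._engine = vllm.AsyncLLMEngine.from_engine_args(
                vllm.AsyncEngineArgs(model=self._model_id, **self._engine_args)
            )
            self._loop = loop
        return self._engine
```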
```diff
@@ -321,22 +311,10 @@ async def _generate_from_context_standard(
         )

         if _format is not None:
-            # outlines.generate.json always parses the resulting json into a python dict.
-            # We however want to keep it as a json string for later storing it in ModelOutputThunk
-            schema: dict[str, Any] = _format.model_json_schema()  # type: ignore
-            schema_json: str = json.dumps(schema)
-            regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema(  # type: ignore
-                schema_json  # type: ignore
-            )  # type: ignore
-
-            from outlines.processors import RegexLogitsProcessor
-
-            logits_processor = RegexLogitsProcessor(
-                regex_str,
-                tokenizer=self._tokenizer_for_outlines,  # type: ignore
-            )
-            sampling_params.logits_processors = (
-                [logits_processor] if logits_processor is not None else []
-            )
+            sampling_params.structured_outputs = (
+                vllm.sampling_params.StructuredOutputsParams(
+                    json=_format.model_json_schema()
+                )
+            )

         # stream = model_options.get(ModelOption.STREAM, False)
```
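In both generation paths the pydantic schema is now handed directly to vLLM's structured-outputs machinery (backed by llguidance, per the PR title) instead of being compiled into a regex for an outlines `RegexLogitsProcessor`. A minimal sketch of the new wiring in isolation, assuming pydantic v2 and a vLLM release that exposes `vllm.sampling_params.StructuredOutputsParams`; the `Person` model is purely illustrative:

```python
# Sketch only: how a pydantic schema reaches vLLM's structured-outputs path.
import vllm
from pydantic import BaseModel


class Person(BaseModel):  # illustrative schema, not from the PR
    name: str
    age: int


sampling_params = vllm.SamplingParams(max_tokens=512)
# The JSON schema is passed straight to vLLM; no regex compilation or
# outlines RegexLogitsProcessor is involved anymore.
sampling_params.structured_outputs = vllm.sampling_params.StructuredOutputsParams(
    json=Person.model_json_schema()
)
```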
```diff
@@ -479,20 +457,10 @@ async def generate_from_raw(
         )

         if format is not None:
-            schema: dict[str, Any] = format.model_json_schema()  # type: ignore
-            schema_json: str = json.dumps(schema)
-            regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema(  # type: ignore
-                schema_json  # type: ignore
-            )  # type: ignore
-
-            from outlines.processors import RegexLogitsProcessor
-
-            logits_processor = RegexLogitsProcessor(
-                regex_str,
-                tokenizer=self._tokenizer_for_outlines,  # type: ignore
-            )
-            sampling_params.logits_processors = (
-                [logits_processor] if logits_processor is not None else []
+            sampling_params.structured_outputs = (
+                vllm.sampling_params.StructuredOutputsParams(
+                    json=format.model_json_schema()
+                )
```
Comment on lines +461 to +463

Member
FWIW, when I was trying this locally, what seems to be happening is that the model is following the grammar but not producing valid JSON (which may just be a fact of life with a tiny model; I see the vllm test is using qwen3 0.6b).

Member
A quick hack I tried locally that got the test to pass was to add an additional prompt to the list about using proper JSON content.

Contributor
We could make this error message better, but I think what's happening is that the model runs out of tokens before the JSON completes (at least I see this error with hf sometimes).

Member
Yeah, that seems to be it. I just re-ran the test and see the same thing. Also interesting: on my run it looks like the string is an opening bracket followed by a LOT of newlines...
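The truncation theory above suggests a straightforward mitigation on the caller's side: give the grammar-constrained decode enough token budget to actually close the JSON object. A hedged sketch of that idea (an assumption about a possible fix, not something this PR does; the schema is illustrative):

```python
# Sketch only: raise the generation budget so a grammar-constrained JSON
# response has room to terminate instead of being cut off mid-object.
import vllm

sampling_params = vllm.SamplingParams(
    max_tokens=1024,  # a small max_tokens can cut structured JSON off before it closes
)
sampling_params.structured_outputs = vllm.sampling_params.StructuredOutputsParams(
    json={"type": "object", "properties": {"name": {"type": "string"}}},  # illustrative schema
)
```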
```diff
+            )

         async def generate(prompt, request_id):
```
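The context line `async def generate(prompt, request_id):` points at an inner helper that drives the async engine. For reference, a hedged sketch of the usual shape of such a helper with `vllm.AsyncLLMEngine`; the actual body in this file is not shown in the diff, so the signature and return value here are assumptions:

```python
# Sketch only (not the PR's implementation): AsyncLLMEngine.generate is an
# async generator of cumulative RequestOutput objects; the final item carries
# the completed text.
async def generate(engine, sampling_params, prompt: str, request_id: str) -> str:
    final_output = None
    async for request_output in engine.generate(prompt, sampling_params, request_id):
        final_output = request_output  # keep the most recent (cumulative) output
    assert final_output is not None
    return final_output.outputs[0].text
```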