Skip to content

API Reference

This page is generated from source code docstrings via mkdocstrings.

Package

gimkit

Core Modules

gimkit.guides

FormMixin

Source code in src/gimkit/guides.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
class FormMixin:
    """Factory methods that build ``MaskedTag`` placeholders for common form fields."""

    def single_word(self, name: str | None = None) -> MaskedTag:
        """A single word without spaces."""
        # The docstring above is reused verbatim as the tag description,
        # so it is runtime data and must not be reworded casually.
        description = self.single_word.__doc__
        return MaskedTag(name=name, desc=description, regex=r"\S+")

    def select(self, name: str | None = None, choices: list[str] | None = None) -> MaskedTag:
        """Build a tag whose value must be exactly one of ``choices``.

        Raises:
            ValueError: If ``choices`` is missing or empty.
        """
        if not choices:
            raise ValueError("choices must be a non-empty list of strings.")

        listing = ", ".join(choices)
        # Escape every option so regex metacharacters are matched literally.
        alternatives = [re.escape(option) for option in choices]
        return MaskedTag(
            name=name,
            desc=f"Choose one from the following options: {listing}.",
            regex="|".join(alternatives),
        )

    def datetime(
        self, name: str | None = None, require_date: bool = True, require_time: bool = True
    ) -> MaskedTag:
        """Build a tag for a date, a time, or both (e.g., 2023-10-05 14:30:00).

        Raises:
            ValueError: If neither ``require_date`` nor ``require_time`` is truthy.
        """
        date_part = r"(?:\d{4}-\d{2}-\d{2})"  # YYYY-MM-DD
        time_part = r"(?:\d{2}:\d{2}(?::\d{2})?)"  # HH:MM or HH:MM:SS

        # (date wanted, time wanted) -> (regex, description)
        variants = {
            (True, True): (
                date_part + "[ T]" + time_part,
                "A date and time in the format YYYY-MM-DD HH:MM[:SS].",
            ),
            (True, False): (date_part, "A date in the format YYYY-MM-DD."),
            (False, True): (time_part, "A time in the format HH:MM[:SS]."),
        }
        selection = (bool(require_date), bool(require_time))
        if selection not in variants:
            raise ValueError("At least one of require_date or require_time must be True.")

        pattern, description = variants[selection]
        return MaskedTag(name=name, desc=description, regex=pattern)

datetime(name=None, require_date=True, require_time=True)

A date and/or time string, e.g., 2023-10-05, 14:30:00, 2023-10-05 14:30:00, etc.

Source code in src/gimkit/guides.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def datetime(
    self, name: str | None = None, require_date: bool = True, require_time: bool = True
) -> MaskedTag:
    """Build a tag for a date, a time, or both (e.g., 2023-10-05 14:30:00).

    Raises:
        ValueError: If neither ``require_date`` nor ``require_time`` is truthy.
    """
    date_part = r"(?:\d{4}-\d{2}-\d{2})"  # YYYY-MM-DD
    time_part = r"(?:\d{2}:\d{2}(?::\d{2})?)"  # HH:MM or HH:MM:SS

    # (date wanted, time wanted) -> (regex, description)
    variants = {
        (True, True): (
            date_part + "[ T]" + time_part,
            "A date and time in the format YYYY-MM-DD HH:MM[:SS].",
        ),
        (True, False): (date_part, "A date in the format YYYY-MM-DD."),
        (False, True): (time_part, "A time in the format HH:MM[:SS]."),
    }
    selection = (bool(require_date), bool(require_time))
    if selection not in variants:
        raise ValueError("At least one of require_date or require_time must be True.")

    pattern, description = variants[selection]
    return MaskedTag(name=name, desc=description, regex=pattern)

select(name=None, choices=None)

Choose one from the given options.

Source code in src/gimkit/guides.py
22
23
24
25
26
27
28
def select(self, name: str | None = None, choices: list[str] | None = None) -> MaskedTag:
    """Build a tag whose value must be exactly one of ``choices``.

    Raises:
        ValueError: If ``choices`` is missing or empty.
    """
    if not choices:
        raise ValueError("choices must be a non-empty list of strings.")

    listing = ", ".join(choices)
    # Escape every option so regex metacharacters are matched literally.
    alternatives = [re.escape(option) for option in choices]
    return MaskedTag(
        name=name,
        desc=f"Choose one from the following options: {listing}.",
        regex="|".join(alternatives),
    )

single_word(name=None)

A single word without spaces.

Source code in src/gimkit/guides.py
18
19
20
def single_word(self, name: str | None = None) -> MaskedTag:
    """A single word without spaces."""
    # The docstring above is reused verbatim as the tag description,
    # so it is runtime data and must not be reworded casually.
    description = self.single_word.__doc__
    return MaskedTag(name=name, desc=description, regex=r"\S+")

PersonalInfoMixin

Source code in src/gimkit/guides.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class PersonalInfoMixin:
    """Factory methods that build ``MaskedTag`` placeholders for personal details.

    Each method reuses its own docstring as the tag description, so those
    docstrings are runtime data.
    """

    def person_name(self, name: str | None = None) -> MaskedTag:
        """A person's name, e.g., John Doe, Alice, Bob, Charlie Brown, 张三, etc."""
        # No regex: names are left unconstrained.
        description = self.person_name.__doc__
        return MaskedTag(name=name, desc=description)

    def phone_number(self, name: str | None = None) -> MaskedTag:
        """A phone number, e.g., +1-123-456-7890, (123) 456-7890, 123-456-7890, etc."""

        # Adapted from https://regexr.com/38pvb
        country_code = r"(?:\+?(\d{1,3}))?"
        area_code = r"([-. (]*(\d{3})[-. )]*)?"
        local_number = r"((\d{3})[-. ]*(\d{2,4})(?:[-.x ]*(\d+))?)"
        description = self.phone_number.__doc__
        return MaskedTag(
            name=name,
            desc=description,
            regex=country_code + area_code + local_number,
        )

    def e_mail(self, name: str | None = None) -> MaskedTag:
        """An email address, e.g., john.doe@example.com, alice@example.com, etc."""

        # Adapted from https://regexr.com/3a2i5
        local_part = r"([\w\.]+)"
        domain_part = r"([\w\.]+)\.(\w+)"
        description = self.e_mail.__doc__
        return MaskedTag(name=name, desc=description, regex=local_part + "@" + domain_part)

e_mail(name=None)

An email address, e.g., john.doe@example.com, alice@example.com, etc.

Source code in src/gimkit/guides.py
66
67
68
69
70
71
def e_mail(self, name: str | None = None) -> MaskedTag:
    """An email address, e.g., john.doe@example.com, alice@example.com, etc."""

    # Adapted from https://regexr.com/3a2i5
    local_part = r"([\w\.]+)"
    domain_part = r"([\w\.]+)\.(\w+)"
    # The docstring above doubles as the tag description (runtime data).
    description = self.e_mail.__doc__
    return MaskedTag(name=name, desc=description, regex=local_part + "@" + domain_part)

person_name(name=None)

A person's name, e.g., John Doe, Alice, Bob, Charlie Brown, 张三, etc.

Source code in src/gimkit/guides.py
53
54
55
def person_name(self, name: str | None = None) -> MaskedTag:
    """A person's name, e.g., John Doe, Alice, Bob, Charlie Brown, 张三, etc."""
    # The docstring above doubles as the tag description (runtime data).
    # No regex: names are left unconstrained.
    description = self.person_name.__doc__
    return MaskedTag(name=name, desc=description)

phone_number(name=None)

A phone number, e.g., +1-123-456-7890, (123) 456-7890, 123-456-7890, etc.

Source code in src/gimkit/guides.py
57
58
59
60
61
62
63
64
def phone_number(self, name: str | None = None) -> MaskedTag:
    """A phone number, e.g., +1-123-456-7890, (123) 456-7890, 123-456-7890, etc."""

    # Adapted from https://regexr.com/38pvb
    country_code = r"(?:\+?(\d{1,3}))?"
    area_code = r"([-. (]*(\d{3})[-. )]*)?"
    local_number = r"((\d{3})[-. ]*(\d{2,4})(?:[-.x ]*(\d+))?)"
    # The docstring above doubles as the tag description (runtime data).
    description = self.phone_number.__doc__
    return MaskedTag(
        name=name,
        desc=description,
        regex=country_code + area_code + local_number,
    )

gimkit.schemas

Defines the schema for GIM.

MaskedTag dataclass

Represents a masked tag in the GIM schema.

A masked tag consists of three main types of components: 1. Tag ID: An integer identifier for the tag, represented as m_{id} in the tag attributes. 2. Tag content: The content located between the opening and closing masked tag markers. 3. Tag common attributes: All other tag attributes aside from the ID (e.g., name, desc, regex).

Example of a masked tag

<|MASKED id="m_0" name="xxx" desc="xxx" regex="xxx"|>content here<|/MASKED|>

Source code in src/gimkit/schemas.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
@dataclass
class MaskedTag:
    """One masked tag of the GIM schema.

    Every tag is made of three kinds of components:
    1. **Tag ID**: an integer identifier, rendered as `m_{id}` in the tag attributes.
    2. **Tag content**: the text between the opening and closing masked tag markers.
    3. **Tag common attributes**: every attribute other than the ID (e.g., name, desc, regex).

    Serialized form:
        `<|MASKED id="m_0" name="xxx" desc="xxx" regex="xxx"|>content here<|/MASKED|>`
    """

    id: int | str | None = None
    name: str | None = None
    desc: str | None = None
    regex: str | None = None
    content: str | None = None

    # Extra escapes applied after `html.escape` when serializing attribute
    # values. Hexadecimal numeric character references are used, in line with
    # the conventions of Python's built-in `html.escape`.
    # Ref: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
    _ADDITIONAL_ATTR_ESCAPES: ClassVar[Mapping[str, str]] = MappingProxyType(
        {
            "\t": "&#x09;",  # Tab
            "\n": "&#x0a;",  # Line Feed
            "\r": "&#x0d;",  # Carriage Return
        }
    )

    @classmethod
    def attr_escape(cls, text: str) -> str:
        """Escape ``text`` for use inside a double-quoted tag attribute."""
        result = html.escape(text, quote=True)
        for raw_char, char_ref in cls._ADDITIONAL_ATTR_ESCAPES.items():
            result = result.replace(raw_char, char_ref)
        return result

    @classmethod
    def attr_unescape(cls, text: str) -> str:
        """Turn HTML character references in ``text`` back into plain characters."""
        return html.unescape(text)

    def __post_init__(self):
        """Validate and normalize the fields right after construction.

        Raises:
            ValueError: If any field has an unsupported type or value.
        """
        # 1. Validate id: None, an int, or a string of digits (converted to int).
        raw_id = self.id
        id_is_valid = (
            raw_id is None
            or isinstance(raw_id, int)
            or (isinstance(raw_id, str) and raw_id.isdigit())
        )
        if not id_is_valid:
            raise ValueError(f"{type(self.id)=}, {self.id=}, should be int, str of digits, or None")
        if isinstance(raw_id, str):
            self.id = int(raw_id)

        # 2. Validate common attributes: each is None or a string, stored unescaped.
        for attr in COMMON_ATTRS:
            attr_val = getattr(self, attr)
            if attr_val is None:
                continue
            if not isinstance(attr_val, str):
                raise ValueError(f"{type(attr_val)=}, {attr_val=}, should be str or None")
            setattr(self, attr, MaskedTag.attr_unescape(attr_val))

        # 3. Validate content
        if self.content is not None:
            if not isinstance(self.content, str):
                raise ValueError(f"{type(self.content)=}, {self.content=}, should be str or None")
            # TAG_OPEN_RIGHT is common in ordinary text, so it is tolerated;
            # every other magic string is forbidden inside content.
            special_marks = [s for s in MAGIC_STRINGS if s != TAG_OPEN_RIGHT]
            for special_mark in special_marks:
                if special_mark in self.content:
                    raise ValueError(
                        "content should not contain special marks like "
                        + " or ".join(f"`{x}`" for x in special_marks)
                    )

        # 4. Validate regex if provided
        pattern = self.regex
        if not isinstance(pattern, str):
            return
        if pattern.startswith("^") or pattern.endswith("$"):
            raise ValueError(
                "regex should not start with ^ or end with $, "
                "as it will be used within a larger regex pattern."
            )
        if pattern.startswith("/") or pattern.endswith("/"):
            raise ValueError(
                "regex should not start or end with /, "
                "as it will be wrapped with /.../ in CFG grammar."
            )
        if not pattern:
            raise ValueError("regex should not be an empty string.")
        try:
            re.compile(pattern)
        except re.error as e:
            raise ValueError(f"Invalid regex pattern: {self.regex}") from e

    def to_string(
        self,
        fields: list[TagField] | Literal["all"] = "all",
    ) -> str:
        """Serialize the tag, emitting only the requested ``fields``.

        Fields whose value is None are omitted even when requested.
        """
        selected = cast("list[TagField]", list(ALL_FIELDS)) if fields == "all" else fields

        attr_chunks: list[str] = []
        if "id" in selected and self.id is not None:
            attr_chunks.append(f' id="m_{self.id}"')
        for attr in COMMON_ATTRS:
            if attr not in selected:
                continue
            value = getattr(self, attr)
            if value is not None:
                attr_chunks.append(f' {attr}="{self.attr_escape(value)}"')

        include_content = "content" in selected and self.content is not None
        content_part = f"{self.content}" if include_content else ""
        return TAG_OPEN_LEFT + "".join(attr_chunks) + TAG_OPEN_RIGHT + content_part + TAG_END

    def __str__(self):
        """Return the tag serialized with every field."""
        return self.to_string()

    def __repr__(self):
        """Return the same full serialization as ``__str__``."""
        return self.to_string()

    def __add__(self, other: str) -> str:
        """Concatenate the serialized tag with ``other`` (stringified if needed)."""
        head = str(self)
        tail = other if isinstance(other, str) else str(other)
        return head + tail

    def __radd__(self, other: str) -> str:
        """Concatenate ``other`` (stringified if needed) with the serialized tag."""
        head = other if isinstance(other, str) else str(other)
        return head + str(self)

parse_parts(s)

Parse a string into a list of ContextParts (str or MaskedTag).

Parameters:

Name Type Description Default
s str

The string to be parsed. It may contain only masked tags and plain text. Explicit tag ids may start from any non-negative integer, but must then increase by one from tag to tag (e.g., 3, 4, 5, ...).

required

Returns:

Type Description
list[ContextPart]

list[ContextPart]: A list of ContextParts (str or MaskedTag).

Source code in src/gimkit/schemas.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def parse_parts(s: str) -> list[ContextPart]:
    """Split a string into plain-text pieces and MaskedTags, in source order.

    Args:
        s (str): The string to be parsed. It may contain only masked tags and plain text.
            The first explicit tag id may be any non-negative integer; every
            later explicit id must equal the previous position plus one.

    Returns:
        list[ContextPart]: A list of ContextParts (str or MaskedTag).

    Raises:
        InvalidFormatError: If opening/closing markers are mismatched or nested,
            or if explicit tag ids are not consecutive.
    """
    tag_matches = list(TAG_FULL_PATTERN.finditer(s))
    open_count = len(TAG_OPEN_PATTERN.findall(s))
    end_count = len(TAG_END_PATTERN.findall(s))
    # Every opening marker needs one closing marker and one full-tag match.
    if open_count != end_count or end_count != len(tag_matches):
        raise InvalidFormatError(f"Mismatched or nested masked tags in {s}")

    parts: list[ContextPart] = []
    expected_id = None  # stays None until the first tag with an explicit id
    cursor = 0
    for match in tag_matches:
        start, end = match.span()
        if cursor < start:
            # Plain text between the previous tag and this one.
            parts.append(s[cursor:start])

        fields = match.groupdict()
        raw_id = fields.get("id")
        if raw_id is not None:
            tag_id = int(raw_id)
            if expected_id is None:
                expected_id = tag_id
            elif tag_id != expected_id:
                raise InvalidFormatError(
                    f"Tag ids should be in order, got {tag_id} at position {expected_id}."
                )
        if expected_id is not None:
            # Tags without an id still consume a position once counting began.
            expected_id += 1
        parts.append(MaskedTag(**fields))

        cursor = end

    trailing_text = s[cursor:]
    if trailing_text:
        parts.append(trailing_text)
    return parts

parse_tags(s, prefix=None, suffix=None)

Parse a string into a list of MaskedTags.

Parameters:

Name Type Description Default
s str

The string to be parsed. It may be wrapped with a prefix and suffix. Explicit tag ids must be consecutive; when a prefix is given, they must also match each tag's position (0, 1, 2, ...).

required
prefix str | None

The prefix tag that the string should start with. Default is None.

None
suffix str | None

The suffix tag that the string should end with. Default is None.

None

Returns:

Type Description
list[MaskedTag]

list[MaskedTag]: A list of MaskedTags.

Source code in src/gimkit/schemas.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
def parse_tags(s: str, prefix: str | None = None, suffix: str | None = None) -> list[MaskedTag]:
    """Parse a string into a list of MaskedTags.

    Args:
        s (str): The string to be parsed. It may be wrapped with a prefix and suffix.
            Explicit tag ids must be consecutive; when ``prefix`` is given they
            must also match each tag's position (0, 1, 2, ...).
        prefix (str | None): The prefix tag that the string should start with. Default is None.
        suffix (str | None): The suffix tag that the string should end with. Default is None.

    Returns:
        list[MaskedTag]: A list of MaskedTags.

    Raises:
        InvalidFormatError: If the prefix/suffix is missing or repeated, if the
            body cannot be parsed, or if the tag ids are out of order.
    """

    if prefix is not None:
        # Leading whitespace before the prefix is tolerated.
        s = s.lstrip()
        if not s.startswith(prefix):
            raise InvalidFormatError(f"String must start with the {prefix} tag.")

        s = s[len(prefix) :]
        if prefix in s:
            raise InvalidFormatError(f"Nested or duplicate {prefix} tag are not allowed.")

    if suffix is not None:
        # Trailing whitespace after the suffix is tolerated.
        s = s.rstrip()
        if not s.endswith(suffix):
            raise InvalidFormatError(f"String must end with the {suffix} tag.")

        s = s[: -len(suffix)]
        if suffix in s:
            raise InvalidFormatError(f"Nested or duplicate {suffix} tag are not allowed.")

    parts = parse_parts(s)
    tags = [part for part in parts if isinstance(part, MaskedTag)]

    if prefix is not None:
        expected_ids = list(range(len(tags)))
        # A tag without an id takes its position. The test must be
        # `is not None`: `tag.id or idx` would treat an explicit id of 0 as
        # missing and let a misplaced `m_0` through.
        actual_ids = [tag.id if tag.id is not None else idx for idx, tag in enumerate(tags)]
        if expected_ids != actual_ids:
            raise InvalidFormatError(
                f"Tag ids should be in order 0, 1, 2, ..., got {', '.join(map(str, actual_ids))}."
            )

    return tags

validate(query, response)

Validate the GIM query and/or GIM response.

Parameters:

Name Type Description Default
query str

Wrapped with query prefix and suffix.

required
response str

Wrapped with response prefix and suffix.

required

Raises:

Type Description
ValueError

If both query and response are None.

InvalidFormatError

If the format of query or response is invalid, or if the number of masked tags or their ids do not match between query and response.

Source code in src/gimkit/schemas.py
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def validate(query: str | None, response: str | None):
    """Check a GIM query, a GIM response, or both.

    Args:
        query (str | None): Wrapped with query prefix and suffix.
        response (str | None): Wrapped with response prefix and suffix.

    Raises:
        ValueError: If both query and response are None.
        InvalidFormatError: If the format of query or response is invalid,
            or if the number of masked tags differs between query and response.
    """
    if query is None and response is None:
        raise ValueError("At least one of query or response must be provided.")

    # Parsing validates each side on its own; the query is parsed first.
    query_tags = None if query is None else parse_tags(query, QUERY_PREFIX, QUERY_SUFFIX)
    response_tags = (
        None if response is None else parse_tags(response, RESPONSE_PREFIX, RESPONSE_SUFFIX)
    )

    if query_tags is None or response_tags is None:
        return
    if len(query_tags) != len(response_tags):
        raise InvalidFormatError("Mismatched number of masked tags between query and response.")

gimkit.contexts

Query

Bases: Context

Source code in src/gimkit/contexts.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class Query(Context):
    """A context wrapped in the query prefix/suffix whose tags are numbered from 0."""

    def __init__(self, *args: ContextInput) -> None:
        """Build the query, then validate and normalize its masked tags.

        Raises:
            InvalidFormatError: If an explicit tag id does not match the tag's
                position, or if a tag name is used more than once.
        """
        super().__init__(QUERY_PREFIX, QUERY_SUFFIX, *args)

        # Validate and standardize the tags
        seen_names = set()
        masked_tags = (part for part in self._parts if isinstance(part, MaskedTag))
        for position, tag in enumerate(masked_tags):
            if tag.id is not None and tag.id != position:
                raise InvalidFormatError("Tag ids must be sequential starting from 0.")
            tag.id = position

            if tag.name is not None:
                if tag.name in seen_names:
                    raise InvalidFormatError(f"Tag name '{tag.name}' already exists.")
                seen_names.add(tag.name)

            # An empty content string is stored as None.
            if tag.content == "":
                tag.content = None

    def infill(self, response: Response | ContextInput) -> Result:
        """Fill the tags of this query with content taken from ``response``."""
        result = infill(self, response)
        return result

    def __str__(self) -> str:
        """Serialize the query with only the id, desc and content of each tag."""
        visible_fields = ["id", "desc", "content"]
        return self.to_string(fields=visible_fields)

    def to_string_with_grammar(self) -> str:
        """Serialize like ``__str__`` but also include each tag's regex."""
        visible_fields = ["id", "desc", "content", "regex"]
        return self.to_string(fields=visible_fields)

infill(response)

Fills tags in this query (self) with content from the provided response.

Source code in src/gimkit/contexts.py
200
201
202
def infill(self, response: Response | ContextInput) -> Result:
    """Fill the tags of this query (self) with content taken from ``response``."""
    # Delegates to the module-level ``infill`` with this query as first argument.
    result = infill(self, response)
    return result

Response

Bases: Context

Source code in src/gimkit/contexts.py
211
212
213
214
215
216
217
218
219
220
class Response(Context):
    """A context wrapped in the response prefix/suffix."""

    def __init__(self, *args: ContextInput) -> None:
        """Build the response from ``args`` using the response prefix and suffix."""
        super().__init__(RESPONSE_PREFIX, RESPONSE_SUFFIX, *args)

    def infill(self, query: Query | ContextInput) -> Result:
        """Fill the tags of ``query`` with content taken from this response (self)."""
        result = infill(query, self)
        return result

    def __str__(self) -> str:
        """Serialize the response with only the id and content of each tag."""
        visible_fields = ["id", "content"]
        return self.to_string(fields=visible_fields)

infill(query)

Fills the tags in the provided query with content from this response (self).

Source code in src/gimkit/contexts.py
215
216
217
def infill(self, query: Query | ContextInput) -> Result:
    """Fill the tags of ``query`` with content taken from this response (self)."""
    # Delegates to the module-level ``infill`` with this response as second argument.
    result = infill(query, self)
    return result

infill(query, response, strict=False)

Combines query and response by infilling missing content.

Parameters:

Name Type Description Default
query Query | ContextInput

The query containing masked tags to be filled

required
response Response | ContextInput

The response containing content to fill the tags

required
strict bool

If True, raises errors on format mismatches. If False, attempts to repair missing ending tags in a best-effort manner.

False

Returns:

Type Description
Result

A Result object with tags filled from the response

Raises:

Type Description
InvalidFormatError

If strict=True and there are format mismatches

Source code in src/gimkit/contexts.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
def infill(
    query: Query | ContextInput, response: Response | ContextInput, strict: bool = False
) -> Result:
    """Combines query and response by infilling missing content.

    Args:
        query: The query containing masked tags to be filled
        response: The response containing content to fill the tags
        strict: If True, raises errors on format mismatches. If False, attempts to repair
                missing ending tags in a best-effort manner.

    Returns:
        A Result object with tags filled from the response

    Raises:
        InvalidFormatError: If strict=True and there are format mismatches, or if
            strict=False and a string response is malformed and could not be repaired
    """
    if not isinstance(query, Query):
        query = Query(query)

    # When strict=False, try to repair missing endings before parsing
    if not strict and isinstance(response, str):
        response_str = response
        try:
            response = Response(response_str)
        except InvalidFormatError:
            # Try to repair missing endings
            repaired = _repair_missing_endings(response_str)
            if repaired == response_str:
                # Nothing could be repaired: surface the original error.
                raise
            warnings.warn(
                "Response has missing ending tags. Attempting automatic repair.",
                stacklevel=2,
            )
            response = Response(repaired)
    elif not isinstance(response, Response):
        response = Response(response)

    query_tags = list(query.tags)
    response_tags = list(response.tags)
    if len(query_tags) != len(response_tags):
        msg = (
            "Mismatch in number of tags between query and response. "
            f"Query has {len(query_tags)} tag(s), response has {len(response_tags)} tag(s)."
        )
        if strict:
            raise InvalidFormatError(msg)
        warnings.warn(msg + " Will merge as many as possible.", stacklevel=2)

    # Pair tags by position with an index cursor rather than `list.pop(0)`,
    # which shifts the whole list on every call and makes the merge quadratic.
    pair_count = min(len(query_tags), len(response_tags))
    next_pair = 0

    result_parts: list[ContextPart] = []
    for part in query.parts[1:-1]:  # Exclude prefix and suffix
        if isinstance(part, MaskedTag) and next_pair < pair_count:
            q_tag = query_tags[next_pair]
            r_tag = response_tags[next_pair]
            next_pair += 1
            # Keep the query's metadata; prefer the response's content and
            # fall back to the query's own content when the response has none.
            part = MaskedTag(
                id=q_tag.id,
                name=q_tag.name,
                desc=q_tag.desc,
                regex=q_tag.regex,
                content=r_tag.content if r_tag.content is not None else q_tag.content,
            )
        result_parts.append(part)

    return Result(result_parts)

gimkit.dsls

Define DSL builders for various output types.

  • build_cfg constructs a context-free grammar (CFG) using LLGuidance syntax
  • build_json_schema constructs a JSON schema representing the response structure.

build_cfg(query)

Build an LLGuidance context-free grammar (CFG) string based on the query object.

Constructs a flattened grammar structure compatible with LLGuidance's suffix/capture logic.

Ref:
- https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete documentation of llguidance grammar syntax
- https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
- https://github.com/guidance-ai/llguidance: Source code

Real-World Example:
```python
query = '<|GIM_QUERY|>The capital of <|MASKED desc="single word" regex="中国|法国"|><|/MASKED|> is Beijing<|MASKED desc="punctuation mark" regex="\."|><|/MASKED|><|/GIM_QUERY|>'
print(repr(build_cfg(Query(query))))
>>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\s*/\nm_0[capture, suffix="<|/MASKED|>"]: T_0\nm_1[capture, suffix="<|/MASKED|>"]: T_1\nT_0: /中国|法国/\nT_1: /\./\n'
```

Source code in src/gimkit/dsls.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def build_cfg(query: Query) -> str:
    r"""Build an LLGuidance context-free grammar (CFG) string for the given query.

    The grammar is flat: one `start` rule lists the response prefix, every
    opening tag followed by its content rule, and the response suffix. Each
    content rule `m_i` relies on LLGuidance's capture/suffix attributes.

    Ref:
    - https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md: Incomplete documentation of llguidance grammar syntax
    - https://github.com/guidance-ai/guidance/blob/main/guidance/_ast.py: LarkSerializer implementation
    - https://github.com/guidance-ai/llguidance: Source code

    Real-World Example:
    ```python
    query = '<|GIM_QUERY|>The capital of <|MASKED desc="single word" regex="中国|法国"|><|/MASKED|> is Beijing<|MASKED desc="punctuation mark" regex="\."|><|/MASKED|><|/GIM_QUERY|>'
    print(repr(build_cfg(Query(query))))
    >>> '%llguidance {}\nstart: "<|GIM_RESPONSE|>" REGEX "<|MASKED id=\"m_0\"|>" m_0 REGEX "<|MASKED id=\"m_1\"|>" m_1 REGEX "<|/GIM_RESPONSE|>"\nREGEX: /\s*/\nm_0[capture, suffix="<|/MASKED|>"]: T_0\nm_1[capture, suffix="<|/MASKED|>"]: T_1\nT_0: /中国|法国/\nT_1: /\./\n'
    ```

    Raises:
        ValueError: If the assembled grammar is rejected by the grammar validator.
    """
    num_tags = len(query.tags)

    # Start rule, target format:
    #   start: "PREFIX" REGEX "OPEN_TAG_0" m_0 REGEX "OPEN_TAG_1" m_1 ... REGEX "SUFFIX"
    start_symbols = [f'"{RESPONSE_PREFIX}"']
    for i in range(num_tags):
        start_symbols += [
            "REGEX",  # whitespace rule reference
            # Opening tag literal, e.g. "<|MASKED id=\"m_0\"|>" (inner quotes escaped)
            f'"{TAG_OPEN_LEFT} id=\\"m_{i}\\"{TAG_OPEN_RIGHT}"',
            f"m_{i}",  # content rule reference
        ]
    start_symbols += ["REGEX", f'"{RESPONSE_SUFFIX}"']

    # Content rules and shared terminals: tags with the same pattern reuse one
    # terminal, so no duplicate terminal rules are emitted.
    terminal_by_pattern: dict[str, str] = {}
    content_rules: list[str] = []
    for i, tag in enumerate(query.tags):
        # With a suffix, the greedy /(?s:.*)/ is correct and legal (no need for a lazy match).
        pattern = f"/{tag.regex}/" if tag.regex else "/(?s:.*)/"
        terminal_name = terminal_by_pattern.setdefault(pattern, f"T_{len(terminal_by_pattern)}")
        # capture: the engine records this part.
        # suffix: the engine stops at, and consumes, the closing tag (TAG_END).
        content_rules.append(f'm_{i}[capture, suffix="{TAG_END}"]: {terminal_name}')

    terminal_rules = [f"{name}: {pattern}" for pattern, name in terminal_by_pattern.items()]

    grammar_lines = [
        "%llguidance {}",  # header declaration
        f"start: {' '.join(start_symbols)}",
        r"REGEX: /\s*/",  # whitespace rule (named REGEX here; WS would work too)
        *content_rules,
        *terminal_rules,
    ]
    grammar = "\n".join(grammar_lines) + "\n"

    is_error, msgs = validate_grammar_spec(get_grammar_spec(grammar))
    if is_error:
        raise ValueError(
            "Invalid CFG grammar constructed from the query object:\n"
            + "\n".join(msgs)
            + "\nWe recommend checking the syntax documentation at https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md"
        )
    return grammar

build_json_schema(query)

Build a JSON schema dictionary based on the query object.

The JSON schema represents the response structure where each masked tag becomes a field in the JSON object. The field name is "m_{id}" to match the tag id, and patterns are applied when regex is specified.

Source code in src/gimkit/dsls.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def build_json_schema(query: Query) -> dict:
    """Build a JSON schema dictionary describing the response for ``query``.

    Each masked tag becomes a required string property named "m_{id}". The
    tag's regex, when set, becomes an anchored ``pattern``; its desc, when
    set, becomes the ``description``. No additional properties are allowed.
    """

    def _field_schema(tag) -> dict:
        """Return the schema of a single tag's property."""
        field_schema = {"type": "string"}
        if tag.regex is not None:
            # Anchor the pattern so the whole value has to match.
            field_schema["pattern"] = f"^({tag.regex})$"
        if tag.desc is not None:
            field_schema["description"] = tag.desc
        return field_schema

    named_fields = [(f"m_{tag.id}", _field_schema(tag)) for tag in query.tags]

    return {
        "type": "object",
        "properties": dict(named_fields),
        "required": [field_name for field_name, _ in named_fields],
        "additionalProperties": False,
    }

gimkit.prompts

For models that weren't trained using the Guided Infilling Modeling method, we may be able to achieve functionality similar to GIM by using a system prompt and few-shot prompting.

gimkit.log

gimkit.exceptions

GIMError

Bases: Exception

Base exception class for GIM-related errors.

Source code in src/gimkit/exceptions.py
1
2
class GIMError(Exception):
    """Base exception class for GIM-related errors.

    More specific errors such as ``InvalidFormatError`` derive from this
    class, so an ``except GIMError`` handler catches them as well.
    """

InvalidFormatError

Bases: GIMError

Exception raised for invalid GIM query/response format.

Source code in src/gimkit/exceptions.py
5
6
class InvalidFormatError(GIMError):
    """Exception raised for invalid GIM query/response format.

    Derives from ``GIMError``, so handlers written for the base class also
    catch this error.
    """

Model Backends

gimkit.models.base

gimkit.models.openai

gimkit.models.vllm

gimkit.models.vllm_offline

gimkit.models.utils

get_outlines_model_input(model_input, output_type, use_gim_prompt, include_grammar=False, force_chat_input=False)

Transform the model input to an Outlines-compatible format.

Source code in src/gimkit/models/utils.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def get_outlines_model_input(
    model_input: ContextInput | Query,
    output_type: Literal["cfg", "json"] | None,
    use_gim_prompt: bool,
    include_grammar: bool = False,
    force_chat_input: bool = False,
) -> str | Chat:
    """Convert a query (or raw context input) into input for an Outlines model.

    Args:
        model_input: An existing ``Query``, or a value that is passed to
            ``Query(...)`` to build one.
        output_type: Only consulted when ``use_gim_prompt`` is true:
            ``"json"`` selects the JSON-specific system prompt and demo
            conversation, any other value selects the default ones.
        use_gim_prompt: When true, the query text is sent as the final user
            message of a ``Chat`` that starts with the system prompt and
            the demo conversation.
        include_grammar: When true, the query text comes from
            ``to_string_with_grammar()`` instead of ``str(query)``.
        force_chat_input: When true and the result is still a plain string,
            wrap it in a ``Chat`` containing a single user message.

    Returns:
        Either the query text or a ``Chat`` built around it.
    """
    query_obj = model_input if isinstance(model_input, Query) else Query(model_input)

    if include_grammar:
        payload: str | Chat = query_obj.to_string_with_grammar()
    else:
        payload = str(query_obj)

    if use_gim_prompt:
        # JSON output has its own system prompt and few-shot demonstrations.
        system_prompt, demo_msgs = (
            (SYSTEM_PROMPT_MSG_JSON, DEMO_CONVERSATION_MSGS_JSON)
            if output_type == "json"
            else (SYSTEM_PROMPT_MSG, DEMO_CONVERSATION_MSGS)
        )
        user_msg = {"role": "user", "content": payload}
        payload = Chat([system_prompt, *demo_msgs, user_msg])

    # Only a plain string still needs wrapping at this point.
    if force_chat_input and isinstance(payload, str):
        payload = Chat([{"role": "user", "content": payload}])

    return payload

get_outlines_output_type(model_input, output_type)

Transform the output type to an Outlines-compatible format.

Source code in src/gimkit/models/utils.py
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_outlines_output_type(
    model_input: ContextInput | Query, output_type: Literal["cfg", "json"] | None
) -> None | CFG | JsonSchema:
    """Map the requested output type onto the matching Outlines constraint.

    Args:
        model_input: An existing ``Query``, or a value that is passed to
            ``Query(...)`` to build one.
        output_type: ``None`` for unconstrained output, ``"cfg"`` for a
            grammar built with ``build_cfg``, or ``"json"`` for a schema
            built with ``build_json_schema``.

    Returns:
        ``None``, a ``CFG`` or a ``JsonSchema``, depending on ``output_type``.

    Raises:
        ValueError: If ``output_type`` is none of the supported values.
    """
    # The query is built up front, before output_type is inspected.
    query_obj = model_input if isinstance(model_input, Query) else Query(model_input)

    if output_type is None:
        return None
    if output_type == "cfg":
        return CFG(build_cfg(query_obj))
    if output_type == "json":
        return JsonSchema(build_json_schema(query_obj))

    raise ValueError(f"Invalid output type: {output_type}")

infill_responses(query, responses, json_responses=False)

infill_responses(query: ContextInput | Query, responses: str, json_responses: bool = False) -> Result
infill_responses(query: ContextInput | Query, responses: list[str], json_responses: bool = False) -> list[Result]

Infill the provided query with content from the GIM responses or JSON responses.

Source code in src/gimkit/models/utils.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def infill_responses(
    query: ContextInput | Query, responses: str | list[str], json_responses: bool = False
) -> Result | list[Result]:
    """Fill the query with content taken from one or several model responses.

    Args:
        query: The query passed on to ``infill``.
        responses: A single response string, or a non-empty list of them.
        json_responses: When true, each response is first converted with
            ``json_responses_to_gim_response`` before infilling.

    Returns:
        One result for a string input, or a list of results (one per item)
        for a list input.

    Raises:
        TypeError: If ``responses`` is neither a string nor a list, or if the
            list contains a non-string item.
        ValueError: If ``responses`` is an empty list.
    """

    def _infill_one(text: str):
        # JSON responses are converted to a GIM response string first.
        gim_text = json_responses_to_gim_response(text) if json_responses else text
        return infill(query, gim_text)

    if isinstance(responses, str):
        return _infill_one(responses)

    if not isinstance(responses, list):
        raise TypeError(f"Expected responses to be str or list of str, got {type(responses)}")

    if not responses:
        raise ValueError("Response list is empty.")

    # Validate the whole list before processing any item.
    for item in responses:
        if not isinstance(item, str):
            raise TypeError(f"All items in the response list must be strings, got: {responses}")

    return [_infill_one(item) for item in responses]

json_responses_to_gim_response(json_response)

Convert a JSON response string to a GIM response string.

Parameters:

Name Type Description Default
json_response str

A JSON string representing the response.

required

Returns:

Type Description
str

A properly formatted GIM response string.

Raises:

Type Description
ValueError

If any key does not follow the "m_X" format where X is an integer.

Source code in src/gimkit/models/utils.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def json_responses_to_gim_response(json_response: str) -> str:
    """Turn a JSON-formatted model response into a GIM response string.

    The JSON text is parsed with ``json_repair`` (a warning is logged when a
    repair was needed), every key is checked against the ``m_<integer>``
    naming scheme, and the values are emitted as masked tags ordered by
    their integer id.

    Args:
        json_response: A JSON string representing the response.

    Returns:
        A properly formatted GIM response string.

    Raises:
        ValueError: If the parsed JSON is not a dictionary, or if any key
            does not follow the "m_X" format where X is an integer.
    """
    import re

    import json_repair

    from gimkit.log import get_logger

    logger = get_logger(__name__)

    parsed = json_repair.loads(json_response, logging=True)
    if isinstance(parsed, tuple):
        # With logging=True the parser hands back (json_obj, repair_log).
        json_obj, repair_log = parsed
        if repair_log:
            logger.warning(
                "JSON response required repair. Original: %s, Repair actions: %s",
                json_response,
                repair_log,
            )
    else:  # pragma: no cover
        # Not expected with logging=True; accept the bare object anyway.
        json_obj = parsed  # type: ignore[assignment]

    if not isinstance(json_obj, dict):
        raise ValueError(f"Expected JSON response to be a dictionary, got {type(json_obj)}")

    field_pattern = re.compile(r"m_(\d+)")
    id_content_pairs = []
    for field_name, content in json_obj.items():
        id_match = field_pattern.fullmatch(field_name)
        if id_match is None:
            raise ValueError(
                f"Invalid field name in JSON response: {field_name}. Expected format 'm_X' where X is an integer."
            )
        id_content_pairs.append((int(id_match.group(1)), content))

    # Order by numeric id only; contents never take part in the comparison.
    id_content_pairs.sort(key=lambda pair: pair[0])

    tags = [MaskedTag(id=tag_id, content=content) for tag_id, content in id_content_pairs]
    return str(Response(tags))