import os
import re
from dataclasses import dataclass, field
import click
from linkml._version import __version__
from linkml.utils.generator import Generator, shared_arguments
from linkml_runtime.linkml_model.meta import ClassDefinition, EnumDefinition, SlotDefinition
from linkml_runtime.utils.formatutils import camelcase, lcamelcase
# ---------------------------------------------------------------------------
# LinkML built-in type -> proto3 scalar
# https://linkml.io/linkml-model/linkml_model/model/schema/types.yaml
# https://protobuf.dev/programming-guides/proto3/#scalar
# ---------------------------------------------------------------------------
# Proto scalar, keyed by LinkML types->base.
_PROTO_SCALAR_BY_LINKML_BASE: dict[str, str] = {
"str": "string", # str, jsonpointer, jsonpath, sqarqlpath
"int": "int32",
"Bool": "bool", # base: Bool
"bool": "bool", # repr: bool
"float": "float", # proto =IEEE 754 single-precision
"double": "double", # proto =IEEE 754 double-precision
"Decimal": "string", # linkml =Capitalized.
## Linkml - repr: str
"XSDTime": "string",
"XSDDate": "string",
"XSDDateTime": "string", # (datetime/date_or_datetime)
"URIorCURIE": "string",
"Curie": "string",
"URI": "string",
"NCName": "string",
"NodeIdentifier": "string",
"ElementIdentifier": "string",
"Bytes": "bytes",
# XSD-flavoured aliases, defensive
"XSDString": "string",
"XSDInteger": "int32",
"XSDBoolean": "bool",
"XSDFloat": "float",
"XSDDouble": "double",
}
# Name-keyed override for LinkML types whose ``base`` is ambiguous.
#
# LinkML's standard library defines both "float" and "double" with
# "base: float" (both Python floats), so a base-only lookup would
# collapse the proto-side distinction between 32-bit and 64-bit IEEE 754.
# So special-case the canonical type names here.
_PROTO_SCALAR_BY_LINKML_NAME: dict[str, str] = {
"double": "double",
}
# https://protobuf.dev/programming-guides/proto3/#assigning
# proto3 reserves field numbers 19000-19999 for internal use.
_RESERVED_FIELD_LO = 19000
_RESERVED_FIELD_HI = 19999
# Fallback when we can't resolve a slot range (e.g. unknown reference).
# "string" is safest choice - (see _PROTO_SCALAR_BY_LINKML_BASE)
_PROTO_DEFAULT_SCALAR = "string"
def _to_proto_ident(value: str) -> str:
"""Sanitise *value* into a proto3 identifier that follows Google's style guide.
See https://protobuf.dev/programming-guides/style/#identifier
The rule we enforce: identifiers must not start or end with an underscore,
and every underscore must be followed by a letter (never a digit or another
underscore). Concretely:
- Non-identifier characters become ``_``.
- Runs of underscores collapse to one (no ``__``).
- Any underscore immediately followed by a digit has an ``N`` inserted
(``foo_2bar`` becomes "foo_N2bar") since digits aren't allowed after
``_``. Inserting rather than dropping keeps ``foo_2bar`` distinct from
``foo2bar`` (reusing the same leading-digit prefix rule below).
- Leading and trailing underscores are stripped.
- An empty result, or one starting with a digit, is prefixed with ``N`` so
identifier matches ``[A-Za-z][A-Za-z0-9_]*`` and never starts with ``_``.
"""
cleaned = re.sub(r"[^A-Za-z0-9_]", "_", (value or "").strip())
cleaned = re.sub(r"_+", "_", cleaned)
cleaned = re.sub(r"_(\d)", r"_N\1", cleaned)
cleaned = cleaned.strip("_")
if not cleaned:
return "N"
if cleaned[0].isdigit():
cleaned = "N" + cleaned
return cleaned
def _to_upper_snake(value: str) -> str:
    """Return *value* as an UPPER_SNAKE_CASE proto3 enum value name.

    A falsy *value* is treated as the empty string.
    """
    # Insert "_" at every lower/digit -> upper transition first, so that
    # CamelCase word boundaries survive: _to_proto_ident on its own would
    # leave the words fused together.
    camel_boundary = re.compile(r"([a-z0-9])([A-Z])")
    separated = camel_boundary.sub(r"\1_\2", value or "")
    ident = _to_proto_ident(separated)
    return ident.upper()
# NOTE: a stray "[docs]" documentation-link marker (extraction artifact, not code) was removed here.
@dataclass
class ProtoGenerator(Generator):
    """
    A `Generator` for creating Protobuf schemas from a linkml schema.

    The ``visit_*`` / ``end_*`` hooks below each return a text fragment:
    a ``syntax``/``package`` header, a ``message`` per LinkML class with one
    field statement per slot, and finally an ``enum`` per LinkML enum.
    """

    # ClassVars
    generatorname = os.path.basename(__file__)
    generatorversion = "0.2.0"
    valid_formats = ["proto"]
    visit_all_class_slots = True
    uses_schemaloader = True

    # ObjectVars
    # Per-class map of slot name -> proto field number. Populated in visit_class
    # so visit_class_slot can look up the pre-computed number without having to
    # repeat the collision-avoidance logic for every slot.
    _field_numbers: dict[str, int] = field(default_factory=dict, init=False, repr=False)

    # ------------------------------------------------------------------ header
    def visit_schema(self, **kwargs) -> str | None:
        """Return the file header (see :meth:`generate_header`)."""
        return self.generate_header()

    @staticmethod
    def _sanitise_proto_package(value: str | None) -> str | None:
        """Coerce *value* into a valid proto3 package identifier, or return None.

        A proto3 package is a dot-separated list of identifiers, each of
        which must start with a letter. On top of that we follow the proto3
        style guide: lower case, no leading/trailing underscores, no
        consecutive underscores, and no underscore immediately followed by a
        digit. An ``_`` before a digit gets an ``N`` inserted (``foo_2bar``
        becomes "foo_n2bar") rather than dropped, so ``foo_2bar`` stays
        distinct from ``foo2bar``.

        The rules are enforced *per segment*: empty segments (from leading,
        trailing or doubled dots) are dropped, and underscores are stripped
        from both ends of every segment, so ``a..b`` and ``a._b`` both become
        ``a.b``.

        Returns None when no usable identifier can be derived - i.e. nothing
        is left, or any segment would start with a digit. The caller then
        omits the ``package`` line (it's optional in proto3).
        """
        candidate = re.sub(r"[^A-Za-z0-9_.]", "_", (value or "").strip())
        # Collapse runs of underscores; insert `N` after any `_` that precedes
        # a digit (non-destructive, keeps `foo_2bar` distinct from `foo2bar`).
        candidate = re.sub(r"_+", "_", candidate)
        candidate = re.sub(r"_(\d)", r"_N\1", candidate)

        # Validate segment by segment: checking only the first character of
        # the whole string would let `a.2b` or `a..b` through.
        segments = [segment.strip("_").lower() for segment in candidate.split(".")]
        segments = [segment for segment in segments if segment]
        if not segments or any(segment[0].isdigit() for segment in segments):
            return None
        return ".".join(segments)

    def _proto_package(self) -> str | None:
        """Derive a proto3 package name from the schema's ``name`` attribute.

        The schema id is a URI and rarely converts cleanly to a proto
        identifier, so we use "schema.name" as the source. If that name can't
        be sanitised into a valid identifier, return None.
        """
        return self._sanitise_proto_package(self.schema.name)

    def generate_header(self) -> str:
        """Build the header: syntax line, optional package, version comments.

        Returns the lines joined by newlines, with a trailing newline.
        """
        # https://protobuf.dev/reference/protobuf/proto3-spec/#syntax
        items = ['syntax = "proto3";']
        # https://protobuf.dev/reference/protobuf/proto3-spec/#package
        pkg = self._proto_package()
        # 'package' is optional in proto3 - only emit it when we have a value.
        if pkg:
            items.append(f"package {pkg};")
        items.append(f"// metamodel_version: {self.schema.metamodel_version}")
        if self.schema.version:
            items.append(f"// version: {self.schema.version}")
        return "\n".join(items) + "\n"

    # ------------------------------------------------ range / type resolution
    def _proto_range(self, slot_range: str | None) -> str:
        """Resolve a slot range to a proto3 type reference.

        Order of resolution:

        1. LinkML type -> mapped proto3 scalar (via its ``base``).
        2. Schema class -> CamelCase message reference.
        3. Schema enum -> CamelCase enum reference.
        4. Unknown / missing -> ``string`` (safe fallback).

        References to messages and enums must be CamelCase to match the
        declared names - earlier versions of this generator used lcamelcase
        here, producing references that didn't resolve.
        """
        if not slot_range:
            return _PROTO_DEFAULT_SCALAR
        if slot_range in self.schema.types:
            return self._proto_scalar_for_type(slot_range)
        if slot_range in self.schema.classes or slot_range in self.schema.enums:
            return camelcase(slot_range)
        return _PROTO_DEFAULT_SCALAR

    def _proto_scalar_for_type(self, type_name: str) -> str:
        """Map a LinkML type to a proto3 scalar by walking its ``typeof`` chain.

        Some schemas define derived types like ``age_in_years_type`` with
        ``typeof: integer``. We resolve with the following precedence:

        1. **Name override anywhere in the chain.** Handles e.g. ``double``,
           whose ``base`` ambiguously equals ``float`` in LinkML's stdlib -
           and which the schema-loader propagates down to derived types like
           ``Coordinate64: typeof: double``, so the name match needs to win
           regardless of where in the chain it appears.
        2. **Base mapping**, taking the first match while walking from
           *type_name* towards its ancestors.
        3. Otherwise, fall back to ``string``.
        """
        # Collect the typeof chain once, starting at type_name and moving to
        # its ancestors. `visited` guards against cyclic typeof declarations.
        chain: list[str] = []
        visited: set[str] = set()
        current: str | None = type_name
        while current and current in self.schema.types and current not in visited:
            visited.add(current)
            chain.append(current)
            current = self.schema.types[current].typeof

        # Pass 1: a name override anywhere in the chain wins.
        for name in chain:
            if name in _PROTO_SCALAR_BY_LINKML_NAME:
                return _PROTO_SCALAR_BY_LINKML_NAME[name]

        # Pass 2: first mappable base, nearest type first.
        for name in chain:
            base = self.schema.types[name].base
            if base and base in _PROTO_SCALAR_BY_LINKML_BASE:
                return _PROTO_SCALAR_BY_LINKML_BASE[base]
        return _PROTO_DEFAULT_SCALAR

    # ----------------------------------------------- field number assignment
    @staticmethod
    def _next_field_number(n: int, used: set[int]) -> int:
        """Return the next available field number >= *n*.

        Skips numbers already claimed, and reserved range 19000-19999.
        """
        while n in used or _RESERVED_FIELD_LO <= n <= _RESERVED_FIELD_HI:
            n += 1
        return n

    @staticmethod
    def _is_usable_field_number(n: object) -> bool:
        """Return True when *n* may be emitted verbatim as a field number.

        proto3 accepts integers from 1 to 2**29 - 1 (536,870,911), except the
        reserved range 19000-19999. Anything else (None, 0, negatives, bools,
        non-integers, out-of-range values) is rejected.
        """
        if isinstance(n, bool) or not isinstance(n, int):
            return False
        if not 1 <= n <= 536_870_911:
            return False
        return not _RESERVED_FIELD_LO <= n <= _RESERVED_FIELD_HI

    # ---------------------------------------------- class & slot emission
    def visit_class(self, cls: ClassDefinition) -> str | None:
        """Open the ``message`` block for *cls* and pre-compute field numbers.

        Returns the optional ``//`` description comment lines followed by the
        ``message Name {`` line (no trailing newline). As a side effect,
        ``self._field_numbers`` is rebuilt for the slots of *cls*.
        """
        # Every class is emitted as a proto3 message - including mixins,
        # abstracts, and slot-less concrete classes:
        # https://protobuf.dev/reference/protobuf/proto3-spec/#message_definition
        # - proto3 has no concept of "mixin" or "abstract"; the only way to
        #   keep a reference to such a class valid is to declare it.
        # - Slot-less classes become empty messages (`message X {}`), which
        #   proto3 allows. https://protobuf.dev/reference/protobuf/google.protobuf/#empty
        # Otherwise, any field whose range is such a class would point at a
        # non-existent type and protoc would fail.
        # The trade-off is that LinkML's mixin/abstract semantics are not
        # carried over to proto3, but the resulting file is always compilable
        # and every range reference resolves to a declared message.

        # Pre-compute proto field numbers for every slot in this class.
        #
        # proto3 forbids field number 0, requires uniqueness within a message,
        # and reserves 19000-19999. LinkML's `rank` is honoured as a pinned
        # number (for wire compatibility) whenever it is usable; everything
        # else is auto-assigned from 1 upwards. Doing this once up front keeps
        # visit_class_slot a simple lookup.
        slots = list(self.all_slots(cls))

        # Pass 1: pin ranks. A rank is only taken verbatim when it is a legal
        # field number AND no earlier slot has claimed it - a duplicate or
        # out-of-range rank falls back to auto-assignment instead of producing
        # a message protoc would reject.
        pinned: dict[str, int] = {}
        claimed: set[int] = set()
        for slot in slots:
            rank = slot.rank
            if self._is_usable_field_number(rank) and rank not in claimed:
                pinned[slot.name] = rank
                claimed.add(rank)

        # Pass 2: auto-assign the remaining slots in declaration order,
        # skipping pinned numbers and the reserved range.
        next_auto = 1
        self._field_numbers = {}
        for slot in slots:
            number = pinned.get(slot.name)
            if number is None:
                number = self._next_field_number(next_auto, claimed)
                next_auto = number + 1
            self._field_numbers[slot.name] = number

        items = []
        if cls.description:
            items.extend(f"// {dline}" for dline in cls.description.split("\n"))
        items.append(f"message {camelcase(cls.name)} {{")
        return "\n".join(items)

    def end_class(self, cls: ClassDefinition) -> str:
        """Close the ``message`` block opened by :meth:`visit_class`."""
        return "\n}\n"

    def visit_class_slot(self, cls: ClassDefinition, aliased_slot_name: str, slot: SlotDefinition) -> str:
        """Return the field statement for *slot*, prefixed with a newline.

        Multivalued slots are emitted as ``repeated``. Raises ``KeyError`` if
        *slot* was not numbered by the preceding :meth:`visit_class` call.
        """
        qual = "repeated " if slot.multivalued else ""
        slotname = lcamelcase(aliased_slot_name)
        slot_range = self._proto_range(slot.range)
        # Every proto3 field statement must end with `;` - without it `protoc`
        # rejects the file. The field number comes from the pre-computed map
        # built in visit_class, so the forbidden "= 0" is never emitted.
        return f"\n {qual}{slot_range} {slotname} = {self._field_numbers[slot.name]};"

    # ------------------------------------------------------- enum emission
    @staticmethod
    def _unique_ident(ident: str, taken: set[str]) -> str:
        """Return *ident*, or a ``_V<n>``-suffixed variant not yet in *taken*.

        The chosen identifier is added to *taken*. The suffix is ``_V<n>``
        (not ``_<n>``) so the underscore is followed by a letter, per the
        proto3 style guide; numbering starts at 2.
        """
        candidate = ident
        suffix = 2
        while candidate in taken:
            candidate = f"{ident}_V{suffix}"
            suffix += 1
        taken.add(candidate)
        return candidate

    def end_schema(self, **kwargs) -> str | None:
        """Emit an ``enum { ... }`` block for every LinkML enum.

        https://protobuf.dev/programming-guides/proto3/#enum
        https://protobuf.dev/programming-guides/proto3/#enum-default

        proto3 enum constraints honoured here:

        - The first declared value must be numeric 0 and should be named
          ENUM_TYPE_NAME_UNSPECIFIED (or ..._UNKNOWN). A synthetic
          ``<PREFIX>_UNSPECIFIED = 0`` sentinel is emitted for that purpose,
          so real permissible values start at 1.
        - Enum value names live in the scope *enclosing* the enum, so they
          must be unique across every enum in the file. Each value is
          prefixed with the enum name in UPPER_SNAKE_CASE
          (e.g. ``FAMILIAL_RELATIONSHIP_TYPE_SIBLING_OF``), and any remaining
          clash - within one enum or between two enums - is resolved with a
          ``_V<n>`` suffix.
        - Identifiers must be valid proto identifiers; permissible values
          with spaces or other punctuation are sanitised.

        Enums are emitted after the messages, sorted by name. Returns None
        when the schema declares no enums.
        """
        if not self.schema.enums:
            return None

        blocks: list[str] = []
        # Shared by ALL enums: value names are file-scoped in proto3, so a
        # per-enum set would miss collisions such as enum `Foo` / value
        # `Bar_Baz` versus enum `FooBar` / value `Baz`.
        seen_values: set[str] = set()
        for ename in sorted(self.schema.enums):
            enum: EnumDefinition = self.schema.enums[ename]
            proto_name = camelcase(ename)
            value_prefix = _to_upper_snake(ename)

            lines: list[str] = []
            if enum.description:
                lines.extend(f"// {dline}" for dline in enum.description.split("\n"))
            lines.append(f"enum {proto_name} {{")

            # Synthetic zero-value sentinel - claimed first so a real
            # permissible value named "UNSPECIFIED" is the one that gets the
            # disambiguating suffix.
            unspecified_ident = self._unique_ident(f"{value_prefix}_UNSPECIFIED", seen_values)
            lines.append(f" {unspecified_ident} = 0;")

            # Real permissible values start at 1; 0 belongs to the sentinel.
            for i, pv_name in enumerate(enum.permissible_values or {}, start=1):
                value_ident = self._unique_ident(f"{value_prefix}_{_to_upper_snake(pv_name)}", seen_values)
                lines.append(f" {value_ident} = {i};")

            lines.append("}")
            blocks.append("\n".join(lines))
        return "\n" + "\n".join(blocks) + "\n"
@shared_arguments(ProtoGenerator)
@click.version_option(__version__, "-V", "--version")
@click.command(name="proto")
def cli(yamlfile, **args):
    """Generate proto representation of LinkML model"""
    # The same option dict is handed to both the constructor and serialize().
    generator = ProtoGenerator(yamlfile, **args)
    rendered = generator.serialize(**args)
    print(rendered)


if __name__ == "__main__":
    cli()