mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) (#22101)

* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech)

Conformer encoder with Shaw relative position encoding,
QFormer projector, log-mel spectrogram with frame stacking.

Encoder uses GLU gating, folded batch norm, and SSM depthwise
conv. QFormer compresses encoder output via windowed
cross-attention (window=15, queries=3) into the LLM embedding
space.

Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank,
dynamic range compression, 2x frame stacking (80->160 mel).

GGUF converter handles batch norm folding at export time,
fused K/V split, and Conv1d weight reshaping.

Tested against HF transformers reference: token-for-token match
on 30s/60s audio clips with greedy decoding.

* mtmd: rename gs_ prefixed tensors to generic/architecture names

* mtmd: use tensor_mapping.py for all granite_speech tensors

* convert: fold GraniteSpeechTextModel into GraniteModel

* mtmd: replace n_layer hack with explicit has_standard_layers flag

* mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech

* mtmd: align KEY_A_ define spacing

* convert: register GraniteModel for GraniteSpeechForConditionalGeneration

* convert: fix ty type-check for GraniteSpeechMmprojModel registration

* mtmd: align TN_ define spacing

* mtmd: use generic layer loop for granite speech tensor loading

* mtmd: merge qformer_proj_layer into clip_layer

* mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs

* mtmd: granite_speech add comment explaining why build_attn is not used

* mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata

* gguf: add spacing between granite_speech tensor mapping blocks

* mtmd: make generic audio layer_norm_eps read optional

* mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps

* mtmd: align defines and struct fields in clip-impl.h and clip-model.h

* mtmd: fix alignment and ordering issues across granite speech files

* convert: granite_speech use filter_tensors instead of modify_tensors for skipping
This commit is contained in:
Yakine Tahtah
2026-05-06 14:40:59 +02:00
committed by GitHub
parent 750141969c
commit a00e47e422
13 changed files with 870 additions and 9 deletions
+67
View File
@@ -339,6 +339,9 @@ class Keys:
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
CHUNK_SIZE = "clip.audio.chunk_size"
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
MAX_POS_EMB = "clip.audio.max_pos_emb"
class Attention:
HEAD_COUNT = "clip.audio.attention.head_count"
@@ -346,6 +349,9 @@ class Keys:
class Projector:
STACK_FACTOR = "clip.audio.projector.stack_factor"
WINDOW_SIZE = "clip.audio.projector.window_size"
DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
HEAD_COUNT = "clip.audio.projector.head_count"
class Diffusion:
SHIFT_LOGITS = "diffusion.shift_logits"
@@ -854,6 +860,26 @@ class MODEL_TENSOR(IntEnum):
A_ENC_CONV_NORM = auto() # SSM conv
A_ENC_CONV_PW1 = auto()
A_ENC_CONV_PW2 = auto()
A_CTC_OUT = auto()
A_CTC_OUT_MID = auto()
A_ENC_ATTN_REL_POS_EMB = auto()
# qformer projector
A_QF_PROJ_QUERY = auto()
A_QF_PROJ_NORM = auto()
A_QF_PROJ_LINEAR = auto()
A_QF_SELF_ATTN_Q = auto()
A_QF_SELF_ATTN_K = auto()
A_QF_SELF_ATTN_V = auto()
A_QF_SELF_ATTN_O = auto()
A_QF_SELF_ATTN_NORM = auto()
A_QF_CROSS_ATTN_Q = auto()
A_QF_CROSS_ATTN_K = auto()
A_QF_CROSS_ATTN_V = auto()
A_QF_CROSS_ATTN_O = auto()
A_QF_CROSS_ATTN_NORM = auto()
A_QF_FFN_UP = auto()
A_QF_FFN_DOWN = auto()
A_QF_FFN_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1333,6 +1359,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
# qformer projector
MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
@@ -1480,6 +1526,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
MODEL_TENSOR.A_PER_DIM_K_SCALE,
MODEL_TENSOR.A_PER_DIM_SCALE,
MODEL_TENSOR.A_CTC_OUT,
MODEL_TENSOR.A_CTC_OUT_MID,
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
# qformer projector
MODEL_TENSOR.A_QF_PROJ_QUERY,
MODEL_TENSOR.A_QF_PROJ_NORM,
MODEL_TENSOR.A_QF_PROJ_LINEAR,
MODEL_TENSOR.A_QF_SELF_ATTN_Q,
MODEL_TENSOR.A_QF_SELF_ATTN_K,
MODEL_TENSOR.A_QF_SELF_ATTN_V,
MODEL_TENSOR.A_QF_SELF_ATTN_O,
MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
MODEL_TENSOR.A_QF_CROSS_ATTN_K,
MODEL_TENSOR.A_QF_CROSS_ATTN_V,
MODEL_TENSOR.A_QF_CROSS_ATTN_O,
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
MODEL_TENSOR.A_QF_FFN_UP,
MODEL_TENSOR.A_QF_FFN_DOWN,
MODEL_TENSOR.A_QF_FFN_NORM,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
@@ -4158,6 +4224,7 @@ class VisionProjectorType:
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
GRANITE_SPEECH = "granite_speech" # audio
# Items here are (block size, type size)
+18
View File
@@ -1260,6 +1260,24 @@ class GGUFWriter:
def add_audio_stack_factor(self, value: int) -> None:
    """Record the audio projector stack factor.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.Projector.STACK_FACTOR`.
    """
    key = Keys.ClipAudio.Projector.STACK_FACTOR
    self.add_uint32(key, value)
def add_audio_chunk_size(self, value: int) -> None:
    """Record the audio encoder chunk size.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.CHUNK_SIZE`.
    """
    key = Keys.ClipAudio.CHUNK_SIZE
    self.add_uint32(key, value)
def add_audio_conv_kernel_size(self, value: int) -> None:
    """Record the audio encoder convolution kernel size.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.CONV_KERNEL_SIZE`.
    """
    key = Keys.ClipAudio.CONV_KERNEL_SIZE
    self.add_uint32(key, value)
def add_audio_max_pos_emb(self, value: int) -> None:
    """Record the audio encoder `max_pos_emb` hyperparameter.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.MAX_POS_EMB`.
    """
    key = Keys.ClipAudio.MAX_POS_EMB
    self.add_uint32(key, value)
def add_audio_projector_window_size(self, value: int) -> None:
    """Record the audio projector window size.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.Projector.WINDOW_SIZE`.
    """
    key = Keys.ClipAudio.Projector.WINDOW_SIZE
    self.add_uint32(key, value)
def add_audio_projector_downsample_rate(self, value: int) -> None:
    """Record the audio projector downsample rate.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.Projector.DOWNSAMPLE_RATE`.
    """
    key = Keys.ClipAudio.Projector.DOWNSAMPLE_RATE
    self.add_uint32(key, value)
def add_audio_projector_head_count(self, value: int) -> None:
    """Record the audio projector attention head count.

    Passes `value` to `add_uint32` under `Keys.ClipAudio.Projector.HEAD_COUNT`.
    """
    key = Keys.ClipAudio.Projector.HEAD_COUNT
    self.add_uint32(key, value)
def add_xielu_alpha_p(self, values: Sequence[float]):
self.add_array(Keys.xIELU.ALPHA_P, values)
+89
View File
@@ -155,6 +155,21 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_MSFA_NORM: (
"model.vision_tower.timm_model.msfa.norm", # gemma3n
),
MODEL_TENSOR.A_CTC_OUT: (
"encoder.out", # granite_speech
),
MODEL_TENSOR.A_CTC_OUT_MID: (
"encoder.out_mid", # granite_speech
),
MODEL_TENSOR.A_QF_PROJ_QUERY: (
"projector.query", # granite_speech (qformer projector)
),
MODEL_TENSOR.A_QF_PROJ_NORM: (
"projector.qformer.layernorm", # granite_speech (qformer projector)
),
MODEL_TENSOR.A_QF_PROJ_LINEAR: (
"projector.linear", # granite_speech (qformer projector)
),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -1881,6 +1896,7 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_INP_PROJ: (
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
"encoder.input_linear", # granite_speech
),
MODEL_TENSOR.A_ENC_CONV2D: (
@@ -1903,6 +1919,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
"encoder.layers.{bid}.attn.to_q", # granite_speech
),
MODEL_TENSOR.A_ENC_ATTN_K: (
@@ -1910,6 +1927,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
),
MODEL_TENSOR.A_ENC_ATTN_V: (
@@ -1917,6 +1935,7 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
),
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
@@ -1944,6 +1963,7 @@ class TensorNameMap:
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
),
MODEL_TENSOR.A_ENC_OUTPUT: (
@@ -1951,18 +1971,21 @@ class TensorNameMap:
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
"conformer.layers.{bid}.self_attn.post", # gemma4
"encoder.layers.{bid}.attn.to_out", # granite_speech
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
"conformer.layers.{bid}.attention.post_norm", # gemma3n
"encoder.layers.{bid}.post_norm", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
@@ -1979,6 +2002,7 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1988,24 +2012,28 @@ class TensorNameMap:
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
@@ -2062,26 +2090,31 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
),
MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
),
MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
"encoder.layers.{bid}.conv.up_conv", # granite_speech
),
MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
"encoder.layers.{bid}.conv.down_conv", # granite_speech
),
MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
"encoder.layers.{bid}.conv.norm", # granite_speech
),
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
@@ -2105,6 +2138,62 @@ class TensorNameMap:
"model.embed_audio.soft_embedding_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: (
"encoder.layers.{bid}.attn.rel_pos_emb.weight", # granite_speech
),
MODEL_TENSOR.A_QF_SELF_ATTN_Q: (
"projector.qformer.encoder.layer.{bid}.attention.attention.query", # granite_speech
),
MODEL_TENSOR.A_QF_SELF_ATTN_K: (
"projector.qformer.encoder.layer.{bid}.attention.attention.key", # granite_speech
),
MODEL_TENSOR.A_QF_SELF_ATTN_V: (
"projector.qformer.encoder.layer.{bid}.attention.attention.value", # granite_speech
),
MODEL_TENSOR.A_QF_SELF_ATTN_O: (
"projector.qformer.encoder.layer.{bid}.attention.output.dense", # granite_speech
),
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: (
"projector.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # granite_speech
),
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: (
"projector.qformer.encoder.layer.{bid}.crossattention.attention.query", # granite_speech
),
MODEL_TENSOR.A_QF_CROSS_ATTN_K: (
"projector.qformer.encoder.layer.{bid}.crossattention.attention.key", # granite_speech
),
MODEL_TENSOR.A_QF_CROSS_ATTN_V: (
"projector.qformer.encoder.layer.{bid}.crossattention.attention.value", # granite_speech
),
MODEL_TENSOR.A_QF_CROSS_ATTN_O: (
"projector.qformer.encoder.layer.{bid}.crossattention.output.dense", # granite_speech
),
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: (
"projector.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # granite_speech
),
MODEL_TENSOR.A_QF_FFN_UP: (
"projector.qformer.encoder.layer.{bid}.intermediate_query.dense", # granite_speech
),
MODEL_TENSOR.A_QF_FFN_DOWN: (
"projector.qformer.encoder.layer.{bid}.output_query.dense", # granite_speech
),
MODEL_TENSOR.A_QF_FFN_NORM: (
"projector.qformer.encoder.layer.{bid}.output_query.LayerNorm", # granite_speech
),
# NextN/MTP tensors
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",