mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech) (#22101)
* mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech)

  Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space.

  Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel).

  GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping.

  Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding.

* mtmd: rename gs_ prefixed tensors to generic/architecture names
* mtmd: use tensor_mapping.py for all granite_speech tensors
* convert: fold GraniteSpeechTextModel into GraniteModel
* mtmd: replace n_layer hack with explicit has_standard_layers flag
* mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech
* mtmd: align KEY_A_ define spacing
* convert: register GraniteModel for GraniteSpeechForConditionalGeneration
* convert: fix ty type-check for GraniteSpeechMmprojModel registration
* mtmd: align TN_ define spacing
* mtmd: use generic layer loop for granite speech tensor loading
* mtmd: merge qformer_proj_layer into clip_layer
* mtmd: granite_speech remove redundant ggml_build_forward_expand on inputs
* mtmd: granite_speech add comment explaining why build_attn is not used
* mtmd: granite_speech hard-code eps in cpp, remove from GGUF metadata
* gguf: add spacing between granite_speech tensor mapping blocks
* mtmd: make generic audio layer_norm_eps read optional
* mtmd: granite_speech keep encoder eps in GGUF, only hard-code projector eps
* mtmd: align defines and struct fields in clip-impl.h and clip-model.h
* mtmd: fix alignment and ordering issues across granite speech files
* convert: granite_speech use filter_tensors instead of modify_tensors for skipping
This commit is contained in:
@@ -339,6 +339,9 @@ class Keys:
|
||||
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
|
||||
PROJECTION_DIM = "clip.audio.projection_dim"
|
||||
BLOCK_COUNT = "clip.audio.block_count"
|
||||
CHUNK_SIZE = "clip.audio.chunk_size"
|
||||
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
|
||||
MAX_POS_EMB = "clip.audio.max_pos_emb"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.audio.attention.head_count"
|
||||
@@ -346,6 +349,9 @@ class Keys:
|
||||
|
||||
class Projector:
|
||||
STACK_FACTOR = "clip.audio.projector.stack_factor"
|
||||
WINDOW_SIZE = "clip.audio.projector.window_size"
|
||||
DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
|
||||
HEAD_COUNT = "clip.audio.projector.head_count"
|
||||
|
||||
class Diffusion:
|
||||
SHIFT_LOGITS = "diffusion.shift_logits"
|
||||
@@ -854,6 +860,26 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_ENC_CONV_NORM = auto() # SSM conv
|
||||
A_ENC_CONV_PW1 = auto()
|
||||
A_ENC_CONV_PW2 = auto()
|
||||
A_CTC_OUT = auto()
|
||||
A_CTC_OUT_MID = auto()
|
||||
A_ENC_ATTN_REL_POS_EMB = auto()
|
||||
# qformer projector
|
||||
A_QF_PROJ_QUERY = auto()
|
||||
A_QF_PROJ_NORM = auto()
|
||||
A_QF_PROJ_LINEAR = auto()
|
||||
A_QF_SELF_ATTN_Q = auto()
|
||||
A_QF_SELF_ATTN_K = auto()
|
||||
A_QF_SELF_ATTN_V = auto()
|
||||
A_QF_SELF_ATTN_O = auto()
|
||||
A_QF_SELF_ATTN_NORM = auto()
|
||||
A_QF_CROSS_ATTN_Q = auto()
|
||||
A_QF_CROSS_ATTN_K = auto()
|
||||
A_QF_CROSS_ATTN_V = auto()
|
||||
A_QF_CROSS_ATTN_O = auto()
|
||||
A_QF_CROSS_ATTN_NORM = auto()
|
||||
A_QF_FFN_UP = auto()
|
||||
A_QF_FFN_DOWN = auto()
|
||||
A_QF_FFN_NORM = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
@@ -1333,6 +1359,26 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
|
||||
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
|
||||
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
|
||||
# qformer projector
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
|
||||
# NextN/MTP
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
|
||||
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
|
||||
@@ -1480,6 +1526,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE,
|
||||
MODEL_TENSOR.A_PER_DIM_SCALE,
|
||||
MODEL_TENSOR.A_CTC_OUT,
|
||||
MODEL_TENSOR.A_CTC_OUT_MID,
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
|
||||
# qformer projector
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY,
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM,
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O,
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O,
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
|
||||
MODEL_TENSOR.A_QF_FFN_UP,
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN,
|
||||
MODEL_TENSOR.A_QF_FFN_NORM,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
@@ -4158,6 +4224,7 @@ class VisionProjectorType:
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
HUNYUANVL = "hunyuanvl"
|
||||
GRANITE_SPEECH = "granite_speech" # audio
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -1260,6 +1260,24 @@ class GGUFWriter:
|
||||
def add_audio_stack_factor(self, value: int) -> None:
    """Record the audio projector stack factor.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.Projector.STACK_FACTOR``.
    """
    key = Keys.ClipAudio.Projector.STACK_FACTOR
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_chunk_size(self, value: int) -> None:
    """Record the audio chunk size.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.CHUNK_SIZE``.
    """
    key = Keys.ClipAudio.CHUNK_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_conv_kernel_size(self, value: int) -> None:
    """Record the audio convolution kernel size.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.CONV_KERNEL_SIZE``.
    """
    key = Keys.ClipAudio.CONV_KERNEL_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_max_pos_emb(self, value: int) -> None:
    """Record the audio ``max_pos_emb`` value.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.MAX_POS_EMB``.
    """
    key = Keys.ClipAudio.MAX_POS_EMB
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_window_size(self, value: int) -> None:
    """Record the audio projector window size.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.Projector.WINDOW_SIZE``.
    """
    key = Keys.ClipAudio.Projector.WINDOW_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_downsample_rate(self, value: int) -> None:
    """Record the audio projector downsample rate.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.Projector.DOWNSAMPLE_RATE``.
    """
    key = Keys.ClipAudio.Projector.DOWNSAMPLE_RATE
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_projector_head_count(self, value: int) -> None:
    """Record the audio projector head count.

    Forwards ``value`` to ``self.add_uint32`` under the key
    ``Keys.ClipAudio.Projector.HEAD_COUNT``.
    """
    key = Keys.ClipAudio.Projector.HEAD_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
def add_xielu_alpha_p(self, values: Sequence[float]):
|
||||
self.add_array(Keys.xIELU.ALPHA_P, values)
|
||||
|
||||
|
||||
@@ -155,6 +155,21 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_ENC_MSFA_NORM: (
|
||||
"model.vision_tower.timm_model.msfa.norm", # gemma3n
|
||||
),
|
||||
MODEL_TENSOR.A_CTC_OUT: (
|
||||
"encoder.out",
|
||||
),
|
||||
MODEL_TENSOR.A_CTC_OUT_MID: (
|
||||
"encoder.out_mid",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_QUERY: (
|
||||
"projector.query",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_NORM: (
|
||||
"projector.qformer.layernorm",
|
||||
),
|
||||
MODEL_TENSOR.A_QF_PROJ_LINEAR: (
|
||||
"projector.linear",
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
@@ -1881,6 +1896,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ: (
|
||||
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
|
||||
"encoder.input_linear",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV2D: (
|
||||
@@ -1903,6 +1919,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_q", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||
@@ -1910,6 +1927,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||
@@ -1917,6 +1935,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
|
||||
@@ -1944,6 +1963,7 @@ class TensorNameMap:
|
||||
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_self_att", # lfm2
|
||||
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
|
||||
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: (
|
||||
@@ -1951,18 +1971,21 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.post", # gemma4
|
||||
"encoder.layers.{bid}.attn.to_out", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
|
||||
"conformer.layers.{bid}.norm_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post_norm", # gemma3n
|
||||
"encoder.layers.{bid}.post_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
|
||||
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||
@@ -1979,6 +2002,7 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
|
||||
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||
@@ -1988,24 +2012,28 @@ class TensorNameMap:
|
||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
|
||||
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
|
||||
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
|
||||
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
|
||||
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||
@@ -2062,26 +2090,31 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.A_ENC_CONV_DW: (
|
||||
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
|
||||
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_NORM: (
|
||||
"conformer.layers.{bid}.conv.batch_norm", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
|
||||
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW1: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
|
||||
"encoder.layers.{bid}.conv.up_conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV_PW2: (
|
||||
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
|
||||
"encoder.layers.{bid}.conv.down_conv", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: (
|
||||
"conformer.layers.{bid}.norm_conv", # lfm2
|
||||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||
"encoder.layers.{bid}.conv.norm", # granite_speech
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
|
||||
@@ -2105,6 +2138,62 @@ class TensorNameMap:
|
||||
"model.embed_audio.soft_embedding_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: (
|
||||
"encoder.layers.{bid}.attn.rel_pos_emb.weight",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_Q: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.query",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_K: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.key",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_V: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.attention.value",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_O: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.output.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.attention.output.LayerNorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.query",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_K: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.key",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_V: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.attention.value",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_O: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.output.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_UP: (
|
||||
"projector.qformer.encoder.layer.{bid}.intermediate_query.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_DOWN: (
|
||||
"projector.qformer.encoder.layer.{bid}.output_query.dense",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_QF_FFN_NORM: (
|
||||
"projector.qformer.encoder.layer.{bid}.output_query.LayerNorm",
|
||||
),
|
||||
|
||||
# NextN/MTP tensors
|
||||
MODEL_TENSOR.NEXTN_EH_PROJ: (
|
||||
"model.layers.{bid}.eh_proj",
|
||||
|
||||
Reference in New Issue
Block a user