Commit c254c19

HDCharles and kylesayrs authored
Fixing untie to be used only as needed and automatic (#1963)
When models have shared input/output embeddings, we previously had to call the untie function to separate them before modifying those layers. This increased memory usage and was applied at all times, regardless of whether those layers were actually targeted for modification.

Change: automatically detect when a transform or the quantization mixin needs to untie shared embeddings. The untying code is also wrapped in a try/except so that, if it is invoked on a model that can't be untied, it logs a warning rather than raising an error.

New tests are added for this functionality, and old tests are updated to use the automatic untying. The new tests were initially written with claude-code and then rewritten by hand.

TEST PLAN:
pytest tests/llmcompressor/modifiers/quantization/test_handling_shared_embeddings.py

---------

Signed-off-by: HDCharles <[email protected]>
Co-authored-by: Kyle Sayers <[email protected]>
1 parent 8d366bd commit c254c19
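The automatic detection described above reduces to two checks: do the input and output embeddings share a weight, and is either of them among the modules a modifier will touch. A minimal sketch of that idea, reusing the compressed-tensors matching helper that appears later in this diff; the function name would_untie is illustrative and not part of the library:

```python
import torch
from compressed_tensors.utils import match_named_modules


def would_untie(model: torch.nn.Module, targets: list[str], ignore: list[str]) -> bool:
    """Illustrative mirror of the new behavior: untie only when the input and
    output embeddings share a weight AND one of them is actually targeted."""
    inp, out = model.get_input_embeddings(), model.get_output_embeddings()
    if inp is None or out is None or inp.weight is not out.weight:
        return False  # nothing shared, so nothing to untie
    matched = {module for _, module in match_named_modules(model, targets, ignore)}
    return inp in matched or out in matched
```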

File tree

10 files changed, +422 -35 lines


src/llmcompressor/args/model_arguments.py

Lines changed: 3 additions & 2 deletions

@@ -64,10 +64,11 @@ class ModelArguments:
     )
 
     tie_word_embeddings: bool = field(
-        default=False,
+        default=True,
         metadata={
             "help": "Whether the model's input and output word embeddings "
-            "should be tied. Note that this is only relevant if the "
+            "should attempt to be left tied. False means always untie."
+            " Note that this is only relevant if the "
             "model has a output word embedding layer."
         },
     )

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 2 additions & 2 deletions

@@ -233,7 +233,7 @@ def oneshot(
     processor: Optional[Union[str, ProcessorMixin]] = None,
     use_auth_token: bool = False,
     precision: str = "auto",
-    tie_word_embeddings: bool = False,
+    tie_word_embeddings: bool = True,
     trust_remote_code_model: bool = False,
     save_compressed: bool = True,
     model_revision: str = "main",
@@ -282,7 +282,7 @@
         models.
     :param precision: Precision to cast model weights to, default to auto.
     :param tie_word_embeddings: Whether the model's input and output word embeddings
-        should be tied.
+        should be left tied if possible. False means always untie.
     :param trust_remote_code_model: Whether to allow for custom models to execute
         their own modeling files.
     :param save_compressed: Whether to compress sparse models during save.
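
With the default now True, a plain oneshot call leaves the embeddings tied unless a modifier targets them; passing False restores the old always-untie behavior. A hedged example (model and recipe paths are placeholders):

```python
from llmcompressor import oneshot

oneshot(
    model="path/or/hub-id-of-a-tied-embedding-model",  # placeholder
    recipe="recipe.yaml",                              # placeholder
    tie_word_embeddings=False,  # force untying regardless of what is targeted
)
```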

src/llmcompressor/entrypoints/utils.py

Lines changed: 0 additions & 16 deletions

@@ -59,7 +59,6 @@ def pre_process(
     Raises:
         FileNotFoundError: If the model or processor path is invalid.
     """
-    _warn_tied_embeddings(model_args.tie_word_embeddings)
 
     # Initialize model
     if isinstance(model_args.model, (str, PosixPath)):
@@ -150,21 +149,6 @@ def post_process(
     reset_session()
 
 
-def _warn_tied_embeddings(tie_word_embeddings: bool = False):
-    """
-    Logs a warning if the model has tied word embeddings.
-    The `tie_word_embeddings` flag may cause issues during saving in the one-shot
-    calibration workflow due to shared tensor addresses.
-    """
-    if tie_word_embeddings:
-        logger.debug(
-            "The tie_word_embeddings flag is by default set to False. "
-            "This guarantees that the one-shot algorithm saves the final "
-            "weights without errors. Detected tie_word_embeddings=True. "
-            "This may cause issues with the one-shot algorithm on save."
-        )
-
-
 def initialize_model_from_path(
     model_args: ModelArguments,
     training_args: Optional[TrainingArguments] = None,

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 9 additions & 0 deletions

@@ -34,6 +34,9 @@
     reset_quantization_status,
 )
 from llmcompressor.modifiers.utils.hooks import HooksMixin
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_if_target_shared_embedding,
+)
 
 __all__ = ["QuantizationMixin"]
 
@@ -179,6 +182,12 @@ def start_calibration(self, model: torch.nn.Module):
 
         :param model: model to prepare for calibration
         """
+
+        matched_module_generator = (
+            x[1] for x in match_named_modules(model, self.resolved_targets, self.ignore)
+        )
+        untie_if_target_shared_embedding(model, matched_module_generator)
+
         for _, module in match_named_modules(model, self.resolved_targets, self.ignore):
             self._initialize_observers(module)
             self._calibration_hooks |= self._initialize_hooks(module)
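
In practice this means a quantization modifier's targets/ignore lists now decide whether untying happens at all. An illustrative pair of configurations (the W4A16 scheme name is a common preset, not something introduced by this diff):

```python
from llmcompressor.modifiers.quantization import QuantizationModifier

# lm_head ignored: the shared embedding/lm_head weight is never matched, so
# start_calibration leaves the embeddings tied and no extra memory is used.
skip_head = QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# lm_head included: the shared weight is matched, so the mixin unties the
# embeddings automatically before attaching observers.
with_head = QuantizationModifier(targets="Linear", scheme="W4A16")
```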

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 14 additions & 1 deletion

@@ -7,11 +7,14 @@
     TransformScheme,
     apply_transform_config,
 )
-from compressed_tensors.utils import TorchDtype
+from compressed_tensors.utils import TorchDtype, match_named_modules
 from pydantic import Field, ValidationInfo, field_validator
 
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_if_target_shared_embedding,
+)
 
 __all__ = ["QuIPModifier"]
 
@@ -100,6 +103,16 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
 
+        def matched_module_generator():
+            for scheme in self.transform_config.config_groups.values():
+                for arg in scheme.apply:
+                    gen = match_named_modules(state.model, arg.targets, arg.ignore)
+                    for _, module in gen:
+                        yield module
+
+        # Untie embeddings if they will be targeted by transforms
+        untie_if_target_shared_embedding(state.model, matched_module_generator())
+
         apply_transform_config(state.model, self.transform_config)
 
     def on_event(self, state: State, event: Event, **kwargs):

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 5 additions & 0 deletions

@@ -16,6 +16,9 @@
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modeling import center_embeddings, fuse_norm_linears
 from llmcompressor.modifiers import Modifier
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_word_embeddings,
+)
 
 from .mappings import SpinQuantMapping, infer_mapping_from_model
 from .norm_mappings import NormMapping, infer_norm_mapping_from_model
@@ -148,6 +151,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
 
+        # needed any time embeddings/lm_head is modified
+        untie_word_embeddings(state.model)
         # needs to happen after the model has been hooked to execute on the GPU
         # otherwise we're applying weight transforms on CPU
         self._center_embeddings(state.model)

src/llmcompressor/transformers/compression/compressed_tensors_utils.py

Lines changed: 84 additions & 2 deletions

@@ -1,5 +1,6 @@
 import os
 import weakref
+from collections.abc import Generator
 from functools import wraps
 from typing import Optional
 
@@ -126,8 +127,15 @@ def untie_word_embeddings(model: PreTrainedModel):
 
     :param model: model to fix
     """
-    input_embed = model.get_input_embeddings()
-    output_embed = model.get_output_embeddings()
+    try:
+        input_embed = model.get_input_embeddings()
+        output_embed = model.get_output_embeddings()
+    except NotImplementedError as e:
+        logger.warning(
+            f"cannot untie model of type {model.__class__} which doesn't have "
+            f"get_input_embeddings and get_output_embeddings implmented\n{e}"
+        )
+        return
 
     for module in (input_embed, output_embed):
         if module is None or not hasattr(module, "weight"):
@@ -149,6 +157,80 @@ def untie_word_embeddings(model: PreTrainedModel):
     model.config.tie_word_embeddings = False
 
 
+def _get_embeddings_or_warn(
+    model: torch.nn.Module,
+) -> tuple[torch.nn.Module | None, torch.nn.Module | None]:
+    if not (
+        hasattr(model, "get_input_embeddings")
+        and hasattr(model, "get_output_embeddings")
+    ):
+        logger.warning(
+            f"{model.__class__} doesn't have attribute get_input_embeddings and"
+            " get_output_embeddings implemented."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+        )
+        return None, None
+
+    try:
+        input_embeddings, output_embeddings = (
+            model.get_input_embeddings(),
+            model.get_output_embeddings(),
+        )
+    except NotImplementedError as e:
+        logger.warning(
+            f"{model.__class__} doesn't have get_input_embeddings and "
+            "get_output_embeddings implemented."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+            f"\n{e}"
+        )
+        return None, None
+
+    if not (
+        isinstance(input_embeddings, torch.nn.Module)
+        and isinstance(output_embeddings, torch.nn.Module)
+    ):
+        logger.warning(
+            f"expected modules from {model.__class__} get_input_embeddings and"
+            f" get_output_embeddings but got {type(input_embeddings)}"
+            f" and {type(output_embeddings)}."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+        )
+        return None, None
+    return input_embeddings, output_embeddings
+
+
+def untie_if_target_shared_embedding(
+    model: torch.nn.Module, matched_module_generator: Generator[torch.nn.Module]
+):
+    """
+    Helper method that checks for shared input/output embedding and unties them
+    if either shows up in the matched_module_generator
+
+    :param model: model to untie if embeddings are shared and targeted by
+        matched_module_generator
+    :param matched_module_generator: Generator of all modules (not names) which
+        will be modified by quantization or transformation
+    """
+    input_embeddings, output_embeddings = _get_embeddings_or_warn(model)
+
+    if None in (input_embeddings, output_embeddings):  # if couldn't find embeddings
+        return
+
+    if (
+        input_embeddings.weight is not output_embeddings.weight
+    ):  # if not shared, can ignore
+        return
+
+    # if shared, check if either is targeted
+    for module in matched_module_generator:
+        if module in (input_embeddings, output_embeddings):
+            untie_word_embeddings(model)
+            return
+
+
 def get_model_compressor(
     model: torch.nn.Module,
     sparsity_config: Optional[SparsityCompressionConfig] = None,
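
A hedged end-to-end sketch of the new helper; the checkpoint name is a placeholder and the example assumes a model whose lm_head weight is tied to its input embedding:

```python
from compressed_tensors.utils import match_named_modules
from transformers import AutoModelForCausalLM

from llmcompressor.transformers.compression.compressed_tensors_utils import (
    untie_if_target_shared_embedding,
)

# Placeholder checkpoint; assumed to have tie_word_embeddings=True in its config.
model = AutoModelForCausalLM.from_pretrained("org/tied-embedding-model")

# Modules a hypothetical scheme would touch (all Linear layers, lm_head included).
matched = (module for _, module in match_named_modules(model, ["Linear"], []))

# Unties only because the targeted lm_head shares its weight with the input
# embedding; for an untied model or an untargeted lm_head this is a no-op.
untie_if_target_shared_embedding(model, matched)
assert model.get_input_embeddings().weight is not model.get_output_embeddings().weight
```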
