
Commit 9fea797

chochowski, j-rausch, and kevalmorabia97 authored and committed
gpt-oss 20b support (#889)
## What does this PR do?

Adds gpt-oss-20b support for Puzzle any-model pruning.

**Type of change:** new feature

**Overview:** Adds the descriptor, converter, and YAML configuration files for expert removal. Introduces slight changes to conversion to account for the MXFP4-quantized checkpoint of gpt-oss.

---------

Signed-off-by: mchochowski <mchochowski@nvidia.com>
Signed-off-by: jrausch <jrausch@nvidia.com>
Signed-off-by: chochowski <Marcin.Chochowski@gmail.com>
Co-authored-by: J Rausch <38429553+j-rausch@users.noreply.github.com>
Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: Daniel Korzekwa <dkorzekwa@nvidia.com>
1 parent 5ce7362 commit 9fea797

File tree

19 files changed
+969 −79 lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion

```diff
@@ -109,7 +109,8 @@ repos:
             examples/speculative_decoding/main.py|
             examples/speculative_decoding/medusa_utils.py|
             examples/speculative_decoding/server_generate.py|
-            examples/puzzletron/evaluation/hf_deployable_anymodel\.py|
+            examples/puzzletron/evaluation/lm_eval_anymodel.py|
+            modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py|
             modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
         )$
```

examples/puzzletron/GPTOSS.md

Lines changed: 14 additions & 0 deletions
## GptOss - 20b

With this release, the Puzzle algorithm supports only expert removal for `Gpt-Oss-20b`.

This model ships as a quantized checkpoint, i.e. the MoE expert matrices are quantized in the _MXFP4_ format.
During the pruning steps, Puzzle uses the decompressed model (back in BF16) for statistics and score computation.
This means that during conversion to the Puzzle format we decompress the model and store it in BF16.
Once pruning is done, i.e. the experts to remove have been identified and the process has finished, you may want to restore the _MXFP4_ format of the checkpoint.
To do so, an additional script takes the original and the pruned checkpoints and outputs the pruned checkpoint in _MXFP4_ format:

```bash
python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_pruned_to_mxfp4 \
    --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ \
    --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ \
    --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ \
    --num-layers 24
```
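The decompress step above can be illustrated with a toy sketch of MXFP4 block dequantization. This is not the actual `gpt_oss_pruned_to_mxfp4` converter, just an illustration of the format: MXFP4 packs weights as 4-bit E2M1 values in blocks (of 32 in the MX spec), with each block sharing one power-of-two (E8M0) scale.

```python
# Illustrative MXFP4 dequantization sketch (NOT ModelOpt's converter).
# The 16 representable E2M1 values: sign bit plus magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
FP4_E2M1_VALUES = [
    0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,          # codes 0-7 (positive)
    -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,  # codes 8-15 (negative)
]

def dequantize_mxfp4_block(codes: list[int], scale_exponent: int) -> list[float]:
    """Decode one block of 4-bit codes using its shared power-of-two scale."""
    scale = 2.0 ** scale_exponent
    return [FP4_E2M1_VALUES[c] * scale for c in codes]

# Example: a few codes from one block whose shared scale is 2**-1
block = [0, 1, 2, 7, 9, 15]
print(dequantize_mxfp4_block(block, -1))  # [0.0, 0.25, 0.5, 3.0, -0.25, -3.0]
```

The real converter additionally has to re-pack the pruned BF16 experts back into this block layout, which is why it needs the original checkpoint as a template.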

examples/puzzletron/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@ The supported modifications are:
 
 To use the Puzzle algorithm effectively, we need to specify the target number of parameters and/or the memory. The final stage is based on Mixed-Integer Programming (MIP) algorithm to find the most optimal combination of layer modifications that satisfy the target requirements.
 
-In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric.
+In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. Other supported models should be compressed in a similar way. For GptOss there is one [additional step to be performed](GPTOSS.md).
 
 > **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md).
```
Lines changed: 110 additions & 0 deletions
```yaml
defaults:
  - pruning: ffn_pruning
  - scoring: ../validate_solutions_defaults
  - realize_model: ../validate_solutions_defaults
  - bypass:
  - override hydra/hydra_logging: disabled
  - _self_

puzzle_dir: ???
descriptor: gpt_oss_20b
teacher_dir: ${puzzle_dir}/ckpts/teacher/
replacement_library_path: ${puzzle_dir}/replacement_library.json
dataset_path: ??? # path to Nemotron-Post-Training-Dataset-v2

skip_realize_model: false

build_replacement_library:
  add_ffn_no_ops: true
  add_attention_no_ops: true

calc_subblock_stats:
  batch_sizes: [64, 96, 128]
  prefill_seq_len: 4096
  generation_seq_len: 4096
  num_active_tokens_override: # Optional override for sequence lengths
  prefill_queue_size: 0
  allocate_prefill_query: false
  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
  merge_with_existing_stats: false
  subblock_stats_filename: "subblock_stats.json"
  moe_stats_filename: "moe_stats.json"
  runtime_stats:
    backend: trt_torch

scoring:
  descriptor: ${descriptor}
  solutions_to_validate:
    skip_existing_solutions: true

  replacement_library_path: ${replacement_library_path}
  solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json}
  teacher_dir: ${to_path:${teacher_dir}}
  output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation

  eval_samples: 128
  micro_batch_size: 1
  seed: 42
  shuffle_seed: 444
  dataset_path: ${dataset_path}

mip:
  single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}}
  subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}}
  output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions}
  gathered_metrics_path:
  puzzle_profile:

  # puzzle_profile:
  objective: metrics.cosine_embedding_loss_hidden_states
  bigger_is_better: false

  subblock_stats_args:
    - batch_size: 96
      weights_dtype: torch.bfloat16
      activations_dtype: torch.bfloat16
      kv_cache_dtype: torch.bfloat16

  report_additional_costs:
    - stats.memory_mib
    - stats.num_params
    - stats.num_kv_heads
    - stats.has_attention
    - stats.has_ffn
    - stats.kv_cache_memory_mib
    - stats.attention_memory_mib
    - stats.ffn_memory_mib
    - stats.ffn_num_params
    - stats.attention_num_params

  human_constraints:
    target_memory: 45_000
    num_params: 3_000_000_000

  mip_constraints:
    metric_overrides:
    max_seconds_per_solution: 60

realize_model:
  descriptor: ${descriptor}
  teacher_dir: ${to_path:${teacher_dir}}
  tokenizer_name: ${to_path:${teacher_dir}}
  replacement_library_path: ${replacement_library_path}
  save_models: true
  solutions_path: # Filled dynamically

  # Validate params
  skip_validation: false # Set to true to skip validation of the model solution
  eval_samples: 128
  micro_batch_size: 1
  seed: 42
  shuffle_seed: 444
  dataset_path: ${dataset_path}

nccl_timeout_minutes: ${timedelta_minutes:10}

# This section redirects Hydra outputs
hydra:
  run:
    dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
```
Lines changed: 17 additions & 0 deletions
```yaml
defaults:
  - gptoss-20b
  - _self_

# Input Hugging Face model to compress
input_hf_model_path: /workspace/hf_models/openai/gpt-oss-20b

# Dataset path for pruning and NAS scoring
dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2

# Working directory for compression outputs
puzzle_dir: /workspace/puzzle_dir

# MIP memory constraint (in MiB)
mip:
  human_constraints:
    target_memory: 16_000 # 16 GiB
```
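This override file composes onto the base `gptoss-20b` config through Hydra's defaults list, so only the keys it sets (paths and the tighter memory constraint) change. As a minimal pure-Python sketch of the merge semantics (Hydra/OmegaConf do this for real, including `${...}` interpolation, which is omitted here):

```python
# Toy deep-merge illustrating Hydra-style config composition (not Hydra itself).
def deep_merge(base: dict, override: dict) -> dict:
    """Recursively merge `override` onto `base`; override wins on conflicts."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {
    "puzzle_dir": "???",
    "mip": {"human_constraints": {"target_memory": 45_000, "num_params": 3_000_000_000}},
}
override = {
    "puzzle_dir": "/workspace/puzzle_dir",
    "mip": {"human_constraints": {"target_memory": 16_000}},
}

merged = deep_merge(base, override)
print(merged["mip"]["human_constraints"])  # {'target_memory': 16000, 'num_params': 3000000000}
```

Note how `num_params` survives from the base while `target_memory` is overridden.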
Lines changed: 21 additions & 0 deletions
```yaml
defaults:
  - pruning_defaults

eval_samples: 2500
activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id}

pruning_mixin:
  _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn
  layer_descriptor:
    _target_: modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_20b_model_descriptor.GptOss20bExpertRemovalLayerDescriptor
    target_name: "mlp.router"

hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.RankedChoiceVotingHook}
activation_hooks_kwargs: # Additional kwargs to pass to the hook init

num_experts_to_keep_list: [24, 16, 8] # num_experts in the gpt-oss-20b teacher is 32
mlp_init_mode: "ExpertRemoval"
mlp_init_config_yaml:
  expert_scores_key: "expert_ranks"
  layer_prefix_template: "model.layers.{layer_idx}.mlp.router"
```
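A hypothetical sketch of the selection step this config drives: the router hook logs per-expert scores, and pruning keeps the `num_experts_to_keep` best-ranked experts in each layer. The actual logic lives in `ExpertRemovalPruningMixIn` and `RankedChoiceVotingHook`; the helper below is illustrative only.

```python
# Illustrative top-k expert selection (not ModelOpt's actual implementation).
def select_experts_to_keep(expert_scores: list[float], num_experts_to_keep: int) -> list[int]:
    """Return indices of the highest-scoring experts, sorted ascending."""
    # Rank expert indices by score, best first, then keep the top k.
    ranked = sorted(range(len(expert_scores)), key=lambda i: expert_scores[i], reverse=True)
    return sorted(ranked[:num_experts_to_keep])

# Example: 6 experts, keep the 3 with the highest recorded scores
scores = [0.9, 0.1, 0.5, 0.7, 0.3, 0.8]
print(select_experts_to_keep(scores, 3))  # [0, 3, 5]
```

In the real flow this runs per layer, keyed by `layer_prefix_template`, for each candidate size in `num_experts_to_keep_list`.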
Lines changed: 34 additions & 0 deletions
```yaml
defaults:
  - /validate_model_defaults

model_name_or_path: ${teacher_dir}
experiment_id: ${pruning.eval_samples}samples_diverse_mini
activations_log_dir: ???
activation_hooks_kwargs: ???

descriptor: ${descriptor}

# Data:
eval_samples: 10_000
micro_batch_size: 1
dataset_path: ${dataset_path}
val_dataset_name: train

# Pruned ckpts
pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id}

## FFN pruning
ffn_list:
mlp_init_mode: "Truncate" # PruneByActivationsLog

## KV-heads pruning
n_heads_in_group_list:
gqa_init_mode: "AverageKV"

## Hidden dimension pruning
hidden_size_list:
hidden_size_init_mode: "PruneByChannelRanking"
linear_init_mode: "FromTeacher"

mlp_init_config_yaml:
  activations_log_dir: ${pruning.activations_log_dir}
```
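To illustrate the `"Truncate"` init mode named above: a naive FFN-pruning baseline simply keeps the leading channels of the intermediate dimension, whereas `PruneByActivationsLog` would rank channels by logged activation statistics instead. A toy sketch, using nested lists in place of real weight tensors:

```python
# Toy "Truncate" FFN pruning sketch (assumed semantics, not ModelOpt's code):
# shrink the MLP intermediate dimension by keeping its first `new_size` channels.
def truncate_ffn_weights(up_proj, down_proj, new_size):
    """up_proj: [intermediate, hidden]; down_proj: [hidden, intermediate]."""
    pruned_up = up_proj[:new_size]                       # drop trailing intermediate rows
    pruned_down = [row[:new_size] for row in down_proj]  # drop the matching columns
    return pruned_up, pruned_down

up = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]  # intermediate=3, hidden=2
down = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]  # hidden=2, intermediate=3
pu, pd = truncate_ffn_weights(up, down, 2)
print(len(pu), len(pd[0]))  # 2 2
```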
Lines changed: 18 additions & 0 deletions
```yaml
model_dtype: torch.bfloat16 # dtype to cast the model for validate_model
autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model
block_size: 8192
bos_rate: 0.5
data_column: messages
val_dataset_name: valid
shuffle_seed: 81436
seed: 42
fim_rate: 0
fim_spm_rate: 0
source_datasets_to_discard:
varlen: false
write_results: false
calc_losses_on_cpu: false
activations_log_dir:
model_name_or_path:
load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn}
```
Lines changed: 11 additions & 0 deletions
```yaml
defaults:
  - /validate_model_defaults
  - _self_

solutions_to_validate:
skip_validation: false
save_models: false
bigger_is_better: false
sort_solutions_by:
calculate_full_score_ablations: false
```

examples/puzzletron/evaluation/hf_deployable_anymodel.py

Lines changed: 36 additions & 13 deletions
```diff
@@ -1,10 +1,35 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/tree/aa457edc3d64d81530159cd3a182932320c78f8c
+
+# MIT License
+#
+# Copyright (c) 2020 EleutherAI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +38,7 @@
 # limitations under the License.
 
 
+import json
 import logging
 from typing import Any
 
@@ -28,6 +54,11 @@
 from peft import PeftModel
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
 
+from modelopt.torch.puzzletron.anymodel.model_descriptor.model_descriptor_factory import (
+    resolve_descriptor_from_pretrained,
+)
+from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+
 try:
     from pytriton.decorators import batch
     from pytriton.model_config import Tensor
@@ -139,18 +170,12 @@ def _load(
         # Wraps model loading with deci_x_patcher for heterogeneous layer configs.
         # See: modelopt/torch/puzzletron/anymodel/puzzformer/utils.py
         # =========================================================================
-        import os
-        import sys
 
-        modelopt_workdir = os.environ.get("MODELOPT_WORKDIR") or os.environ.get(
-            "PUZZLE_WORKDIR"
+        descriptor = resolve_descriptor_from_pretrained(
+            self.hf_model_id_path, trust_remote_code=hf_kwargs.get("trust_remote_code", False)
         )
-        if modelopt_workdir and modelopt_workdir not in sys.path:
-            sys.path.insert(0, modelopt_workdir)
-        from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
-        from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
 
-        with deci_x_patcher(model_descriptor=LlamaModelDescriptor):
+        with deci_x_patcher(model_descriptor=descriptor):
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.hf_model_id_path,
                 torch_dtype=torch_dtype,
@@ -587,8 +612,6 @@ def ray_infer_fn(self, inputs: dict[Any, Any]):
             - log_probs: Optional list of log probabilities if compute_logprob is True
             - top_logprobs: Optional list of top log probabilities if n_top_logprobs > 0
         """
-        import json
-
         try:
             prompts = inputs.pop("prompts")
             temperature = inputs.pop("temperature", 1.0)
```

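The diff above replaces a hard-coded `LlamaModelDescriptor` import (plus `sys.path` manipulation via environment variables) with `resolve_descriptor_from_pretrained`, which derives the descriptor from the checkpoint itself. A hypothetical sketch of that pattern follows; the registry contents and function name below are illustrative, not ModelOpt's actual API.

```python
# Illustrative descriptor-resolution pattern: map a checkpoint's model_type
# (from its Hugging Face config.json) to a registered descriptor name.
import json
from pathlib import Path

# Hypothetical registry; real descriptor classes live in modelopt.torch.puzzletron.
DESCRIPTOR_REGISTRY = {
    "llama": "LlamaModelDescriptor",
    "gpt_oss": "GptOss20bModelDescriptor",
}

def resolve_descriptor_name(checkpoint_dir: str) -> str:
    """Read config.json from a checkpoint directory and look up its descriptor."""
    config = json.loads((Path(checkpoint_dir) / "config.json").read_text())
    model_type = config["model_type"]
    if model_type not in DESCRIPTOR_REGISTRY:
        raise ValueError(f"No descriptor registered for model_type={model_type!r}")
    return DESCRIPTOR_REGISTRY[model_type]
```

Resolving from the checkpoint keeps evaluation code model-agnostic: adding gpt-oss support no longer requires touching the loader, only registering a descriptor.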
0 commit comments
