from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "/mnt/llms/models/zai-org/GLM-4.6"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
# GLM-4.6 is a MoE model with:
# - Layers 0-2: Dense layers
# - Layers 3-92: MoE layers (160 routed experts + 1 shared expert per layer)
# We need to ignore the MoE gate layers to preserve routing quality.
recipe = [
    AWQModifier(
        ignore=["lm_head", "re:.*mlp\\.gate$"],  # Ignore output head and MoE gates
        scheme="W4A16",
        targets=["Linear"],
    ),
]
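
# Notes on the recipe (the module-name details are an assumption inferred from
# the comments above, not read from the GLM-4.6 sources):
# - "W4A16" quantizes weights to 4-bit groups while activations stay in 16-bit.
# - AWQ uses the calibration data to search for per-channel scales so that the
#   most activation-salient weight channels lose less precision when rounded.
# - The regex "re:.*mlp\.gate$" is meant to match each layer's MoE router
#   (e.g. model.layers.N.mlp.gate) so expert routing stays unquantized.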

# Select calibration dataset.
# Using a chat dataset since GLM-4.6 is an instruct model.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    """Prepare the calibration dataset for GLM-4.6."""
    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)
    return ds
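
# The calibration set is returned untokenized as a "text" column; llmcompressor
# typically tokenizes it during oneshot, truncating to MAX_SEQUENCE_LENGTH.
# Behavior may vary across llmcompressor versions; if yours expects
# pre-tokenized inputs, add a tokenize step inside get_calib_dataset() instead.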

if __name__ == "__main__":
    # Load model and tokenizer.
    print(f"Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Apply AWQ quantization.
    print("Starting AWQ quantization...")
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )
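
    # At this point oneshot should have run the calibration samples through the
    # model, searched for the AWQ smoothing scales, and replaced the targeted
    # Linear weights with their 4-bit quantized versions (a rough sketch of the
    # AWQ flow, not an exact account of llmcompressor internals).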

    # Confirm generations of the quantized model look sane.
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_for_generation(model)
    input_ids = tokenizer(
        "Write a Python function to calculate fibonacci numbers",
        return_tensors="pt",
    ).input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=150)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    # Save to disk compressed.
    print(f"Saving quantized model to: {SAVE_DIR}")
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    print("Done!")