bullerwins

Untitled

Oct 1st, 2025
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "/mnt/llms/models/zai-org/GLM-4.6"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
# GLM-4.6 is a MoE model with:
# - Layers 0-2: dense layers
# - Layers 3-92: MoE layers (160 routed experts + 1 shared expert per layer)
# We need to ignore the MoE gate layers to preserve routing quality.
recipe = [
    AWQModifier(
        ignore=["lm_head", "re:.*mlp\\.gate$"],  # Ignore output head and MoE gates
        scheme="W4A16",
        targets=["Linear"],
    ),
]
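
# Sanity check (sketch, not part of the original paste): the "re:" prefix in the
# ignore list marks the entry as a regular expression over module names, so the
# pattern below should hit only the per-layer MoE router and leave the regular
# gate_proj projections quantized. The example module names are assumptions
# based on typical GLM-style MoE naming, not read from the actual checkpoint.
import re

_GATE_PATTERN = r".*mlp\.gate$"
assert re.fullmatch(_GATE_PATTERN, "model.layers.45.mlp.gate")           # router: ignored
assert not re.fullmatch(_GATE_PATTERN, "model.layers.45.mlp.gate_proj")  # MLP proj: quantized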

# Select calibration dataset.
# Using a chat dataset since GLM-4.6 is an instruct model.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    """Prepare calibration dataset for GLM-4.6."""
    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)
    return ds

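
# Note: with tokenize=False, apply_chat_template returns each conversation
# rendered as a single prompt string in GLM-4.6's chat format. The "text"
# column is then expected to be tokenized and truncated to MAX_SEQUENCE_LENGTH
# during calibration (assumed from typical llmcompressor dataset handling,
# not verified against this exact version).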
if __name__ == "__main__":
    # Load model and tokenizer.
    print(f"Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Apply AWQ quantization.
    print("Starting AWQ quantization...")
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Confirm generations of the quantized model look sane.
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_for_generation(model)
    input_ids = tokenizer(
        "Write a Python function to calculate fibonacci numbers", return_tensors="pt"
    ).input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=150)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    # Save to disk compressed.
    print(f"Saving quantized model to: {SAVE_DIR}")
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    print("Done!")
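
# Optional follow-up (sketch, not part of the original script): the compressed
# checkpoint in SAVE_DIR can typically be served with vLLM, assuming your vLLM
# build supports the GLM-4.6 architecture and compressed-tensors W4A16
# checkpoints:
#
#   from vllm import LLM
#   llm = LLM(model=SAVE_DIR)
#   print(llm.generate("Write a Python function to calculate fibonacci numbers"))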