
oobabooga text-generation-webui with llama.cpp GPU support on Windows via WSL2

May 20th, 2023
# 1 install WSL2 on Windows 11, then:
sudo apt update
sudo apt-get install build-essential
sudo apt install git -y

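# (extra check, not in the original guide) confirm the toolchain installed cleanly
gcc --version
git --version
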
# optional: install a better terminal experience, otherwise skip to step 4
# 2 install brew
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
(echo; echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"') >> /home/$USER/.bashrc
eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
brew doctor

# 3 install oh-my-posh
brew install jandedobbeleer/oh-my-posh/oh-my-posh
ls $(brew --prefix oh-my-posh)/themes
#   copy the themes path and use it below in the second eval line:
nano ~/.bashrc
#   add this to the end:
#       eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
#       eval "$(oh-my-posh init bash --config '/home/linuxbrew/.linuxbrew/opt/oh-my-posh/themes/atomic.omp.json')"
#          plugins=(
#            git
#            # other plugins
#          )
#   CTRL+X to end editing
#   Y to save changes
#   ENTER to finally exit
source ~/.bashrc
exec bash

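# (extra check, not in the original guide) the new prompt theme should render in the fresh shell;
# oh-my-posh also reports its version if the install worked
oh-my-posh version
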
# 4 install mamba instead of conda, because it's faster https://mamba.readthedocs.io/en/latest/installation.html
mkdir github
mkdir downloads
cd downloads
wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
bash Mambaforge-$(uname)-$(uname -m).sh

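# (extra check, not in the original guide) reload the shell config so the installer's changes
# take effect, then confirm mamba is available
source ~/.bashrc
mamba --version
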
# 5 install the correct cuda toolkit 11.7, not 12.x
wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
sudo sh cuda_11.7.0_515.43.04_linux.run
nano ~/.bashrc
#   add the following line so the WSL cuda libraries end up on the library path:
#       export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH
#   place it after the plugins=() code block, above the conda initialize block
#   CTRL+X to end editing
#   Y to save changes
#   ENTER to finally exit
source ~/.bashrc
cd ..

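# (extra check, not in the original guide) nvcc is not on PATH yet (that happens in step 9),
# so call it by its full path; nvidia-smi talks to the Windows GPU driver through WSL2
/usr/local/cuda/bin/nvcc --version
nvidia-smi
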
# 6 install ooba's textgen
mamba create --name textgen python=3.10.9
mamba activate textgen
pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio -f https://download.pytorch.org/whl/cu117/torch_stable.html
git clone https://github.com/oobabooga/text-generation-webui
cd text-generation-webui
pip install -r requirements.txt

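# (extra check, not in the original guide) make sure the cu117 build of torch sees the GPU
# before building any CUDA extensions against it
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
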
# 7 Install 4bit support through GPTQ-for-LLaMa
mkdir repositories
cd repositories
# choose ONE of the following:
# A) for fast triton https://www.reddit.com/r/LocalLLaMA/comments/13g8v5q/fastest_inference_branch_of_gptqforllama_and/
    git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b fastest-inference-4bit
# B) for triton
    git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b triton
# C) for newer cuda
    git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b cuda
# D) for widely compatible old cuda
    git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda
# groupsize, act-order, true-sequential
#   --act-order (quantizing columns in order of decreasing activation size)
#   --true-sequential (performing sequential quantization even within a single Transformer block)
#   These fix GPTQ's strangely bad performance on the 7B model (from 7.15 to 6.09 Wiki2 PPL) and lead to slight improvements on most models/settings in general.
#   --groupsize
#   Currently, groupsize and act-order do not work together; you must choose one of them.
#   Ooba: there is a pytorch branch from qwop that allows you to use groupsize and act-order together.
#   Models without group-size (better for the 7b model)
#   Models with group-size (better from 13b upwards)
cd GPTQ-for-LLaMa
pip install -r requirements.txt
python setup_cuda.py install
cd ..
cd ..

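# (extra check, not in the original guide) the cuda branches build a kernel extension usually
# named quant_cuda; if you picked a triton branch there is no extension to import, so skip this
python -c "import quant_cuda; print('GPTQ CUDA kernel OK')"
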
# 8 Test ooba with a 4bit GPTQ model
python download-model.py 4bit/WizardLM-13B-Uncensored-4bit-128g
python server.py --wbits 4 --model_type llama --groupsize 128 --chat

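# (note, not in the original guide) server.py starts a Gradio UI, by default on port 7860;
# WSL2 forwards localhost, so open http://127.0.0.1:7860 in your Windows browser, or probe it:
curl -s -o /dev/null -w "%{http_code}\n" http://127.0.0.1:7860
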
# 9 install llama.cpp
cd repositories
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
nano ~/.bashrc
#   add the cuda bin folder to the PATH environment variable so make can find nvcc:
#       export PATH=/usr/local/cuda/bin:$PATH
#   place it after the export LD_LIBRARY_PATH line
#   CTRL+X to end editing
#   Y to save changes
#   ENTER to finally exit
source ~/.bashrc
make LLAMA_CUBLAS=1
cd models
wget https://huggingface.co/TheBloke/WizardLM-13B-Uncensored-GGML/resolve/main/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin
cd ..

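# (extra check, not in the original guide) confirm the cuBLAS build produced the main binary
ls -lh ./main
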
# 10 test llama.cpp with GPU support
./main -t 8 -m models/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin --color -c 2048 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "### Instruction: write a story about llamas ### Response:" --n-gpu-layers 30
cd ..
cd ..

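# (note, not in the original guide) --n-gpu-layers controls how many layers are offloaded to
# VRAM; lower it if you run out of memory, e.g. rerun the test from this directory with:
#     repositories/llama.cpp/main -t 8 -m repositories/llama.cpp/models/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin -p "Hello" -n 64 --n-gpu-layers 20
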
# 11 prepare ooba's textgen for llama.cpp support, by compiling llama-cpp-python with cuda GPU support
pip uninstall -y llama-cpp-python
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
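# (extra check, not in the original guide) confirm the rebuilt binding imports, then point ooba
# at the GGML file; the models/ layout and flag names below are assumptions based on the
# May 2023 text-generation-webui CLI
python -c "import llama_cpp; print(llama_cpp.__version__)"
cp repositories/llama.cpp/models/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin models/
python server.py --model wizardLM-13B-Uncensored.ggmlv3.q4_0.bin --n-gpu-layers 30 --chat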