- # 1 install WSL2 on Windows 11, then:
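- # (on the Windows side this usually just means running "wsl --install" in an elevated PowerShell, which sets up WSL2 with an Ubuntu distro by default)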
- sudo apt update
- sudo apt install build-essential -y
- sudo apt install git -y
- # optional: steps 2 and 3 only set up a nicer terminal prompt; skip ahead to step 4 if you don't want that
- # 2 install brew
- /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
- (echo; echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"') >> /home/$USER/.bashrc
- eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
- brew doctor
- # 3 install oh-my-posh
- brew install jandedobbeleer/oh-my-posh/oh-my-posh
- ls "$(brew --prefix oh-my-posh)/themes"
- # copy the themes path (plus the theme file you want) and use it in the second eval line below:
- nano ~/.bashrc
- # add this to the end:
- # eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
- # eval "$(oh-my-posh init bash --config '/home/linuxbrew/.linuxbrew/opt/oh-my-posh/themes/atomic.omp.json')"
- # plugins=(
- # git
- # # other plugins
- # )
- # CTRL+X to end editing
- # Y to save changes
- # ENTER to finally exit
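- # optional sketch: the same two lines can be appended without opening nano (the atomic.omp.json path is just an example, substitute the theme path you copied above)
- # cat >> ~/.bashrc << 'EOF'
- # eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
- # eval "$(oh-my-posh init bash --config '/home/linuxbrew/.linuxbrew/opt/oh-my-posh/themes/atomic.omp.json')"
- # EOF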
- source ~/.bashrc
- exec bash
- # 4 install mamba instead of conda, because it's faster https://mamba.readthedocs.io/en/latest/installation.html
- mkdir github
- mkdir downloads
- cd downloads
- wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
- bash Mambaforge-$(uname)-$(uname -m).sh
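- # quick check that mamba is available (you may need to restart the shell or source ~/.bashrc first)
- mamba --version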
- # 5 install the correct cuda toolkit 11.7, not 12.x
- wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
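- # note: under WSL only the CUDA toolkit should be installed; the GPU driver comes from the Windows side, so deselect the driver in the installer (running it with --silent --toolkit should also skip the driver)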
- sudo sh cuda_11.7.0_515.43.04_linux.run
- nano ~/.bashrc
- # add the following line so the WSL CUDA libraries are on the library search path:
- # export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH
- # place it after the plugins=() block and above the conda initialize block
- # CTRL+X to end editing
- # Y to save changes
- # ENTER to finally exit
- source ~/.bashrc
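- # quick check that the toolkit landed in the default location (PATH is only extended later, in step 9)
- /usr/local/cuda/bin/nvcc --version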
- cd ..
- # 6 install ooba's textgen
- mamba create --name textgen python=3.10.9
- mamba activate textgen
- pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio -f https://download.pytorch.org/whl/cu117/torch_stable.html
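- # optional sanity check: the CUDA build of torch should import and see the GPU
- python -c "import torch; print(torch.__version__, torch.cuda.is_available())"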
- git clone https://github.com/oobabooga/text-generation-webui
- cd text-generation-webui
- pip install -r requirements.txt
- # 7 Install 4bit support through GPTQ-for-LLaMa
- mkdir repositories
- cd repositories
- # choose ONE of the following:
- # A) for fast triton https://www.reddit.com/r/LocalLLaMA/comments/13g8v5q/fastest_inference_branch_of_gptqforllama_and/
- git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b fastest-inference-4bit
- # B) for triton
- git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b triton
- # C) for newer cuda
- git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b cuda
- # D) for widely compatible old cuda
- git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda
- # groupsize, act-order, true-sequential
- # --act-order (quantizing columns in order of decreasing activation size)
- # --true-sequential (performing sequential quantization even within a single Transformer block)
- # Those fix GPTQ's strangely bad performance on the 7B model (from 7.15 to 6.09 Wiki2 PPL) and lead to slight improvements on most models/settings in general.
- # --groupsize (quantizing weights in small groups of columns, e.g. 128, which usually improves accuracy)
- # Currently, groupsize and act-order do not work together and you must choose one of them.
- # Ooba: there is a pytorch branch from qwop that allows you to use groupsize and act-order together.
- # Models without group-size (better for the 7b model)
- # Models with group-size (better from 13b upwards)
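- # for reference, those flags are what you would pass to the repo's llama.py when quantizing a model yourself,
- # roughly like this (a sketch only; the model path, dataset and output name are placeholders):
- # python llama.py /path/to/llama-7b-hf c4 --wbits 4 --true-sequential --act-order --save_safetensors llama-7b-4bit.safetensors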
- cd GPTQ-for-LLaMa
- pip install -r requirements.txt
- python setup_cuda.py install
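- # optional: confirm the compiled cuda kernel imports cleanly (only relevant for the cuda branches; quant_cuda is the extension name built by setup_cuda.py)
- python -c "import quant_cuda; print('quant_cuda OK')"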
- cd ..
- cd ..
- # 8 Test ooba with a 4bit GPTQ model
- python download-model.py 4bit/WizardLM-13B-Uncensored-4bit-128g
- python server.py --wbits 4 --model_type llama --groupsize 128 --chat
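- # if the model isn't picked up automatically, it can be selected explicitly; the folder name below follows download-model.py's org_model naming and may differ on your system:
- # python server.py --model 4bit_WizardLM-13B-Uncensored-4bit-128g --wbits 4 --model_type llama --groupsize 128 --chat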
- # 9 install llama.cpp
- cd repositories
- git clone https://github.com/ggerganov/llama.cpp
- cd llama.cpp
- nano ~/.bashrc
- # add the cuda bin folder to the PATH environment variable so that make can find nvcc:
- # export PATH=/usr/local/cuda/bin:$PATH
- # after the export LD_LIBRARY_PATH line
- # CTRL+X to end editing
- # Y to save changes
- # ENTER to finally exit
- source ~/.bashrc
- make LLAMA_CUBLAS=1
- cd models
- wget https://huggingface.co/TheBloke/WizardLM-13B-Uncensored-GGML/resolve/main/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin
- cd ..
- # 10 test llama.cpp with GPU support
- ./main -t 8 -m models/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin --color -c 2048 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "### Instruction: write a story about llamas ### Response:" --n-gpu-layers 30
- cd ..
- cd ..
- # 11 prepare ooba's textgen for llama.cpp models by compiling llama-cpp-python with CUDA GPU support
- pip uninstall -y llama-cpp-python
- CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
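- # optional sketch to confirm the rebuilt llama-cpp-python can offload layers to the GPU (paths assume you are still in the text-generation-webui folder and downloaded the GGML model in step 9)
- python -c "from llama_cpp import Llama; llm = Llama(model_path='repositories/llama.cpp/models/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin', n_gpu_layers=30); print(llm('Q: What sound does a llama make? A:', max_tokens=32)['choices'][0]['text'])"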