-
Notifications
You must be signed in to change notification settings - Fork 0
/
launcher.sh
144 lines (130 loc) · 4.8 KB
/
launcher.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
set -x
CONDA_ENV=llama2-chat
FILE=./models/best_model.pt
PHYSICAL_CORES=56
function main {
init_params "$@"
start_app
}
# init params
function init_params {
script="app/app.py"
port="9000"
auth_token=""
model_id="meta-llama/Llama-2-7b-chat-hf"
window_len=5
dtype="float32"
device="cpu"
max_new_tokens=32
prompt="Once upon time, there was"
num_warmup=15
alpha="auto"
extra_cmd=""
for var in "$@"
do
case $var in
--script=*)
script=$(echo $var | cut -f2 -d=)
;;
--port=*)
port=$(echo $var | cut -f2 -d=)
;;
--auth_token=*)
auth_token=$(echo $var | cut -f2 -d=)
;;
--model_id=*)
model_id=$(echo $var | cut -f2 -d=)
;;
--window_len=*)
window_len=$(echo $var | cut -f2 -d=)
;;
--dtype=*)
dtype=$(echo $var | cut -f2 -d=)
;;
--device=*)
device=$(echo $var | cut -f2 -d=)
;;
--max_new_tokens=*)
max_new_tokens=$(echo $var | cut -f2 -d=)
;;
--prompt=*)
prompt=$(echo $var | cut -f2 -d=)
;;
--num_warmup=*)
num_warmup=$(echo $var | cut -f2 -d=)
;;
--alpha=*)
alpha=$(echo $var | cut -f2 -d=)
;;
--physical_cores=*)
PHYSICAL_CORES=$(echo $var | cut -f2 -d=)
;;
--ipex)
extra_cmd=$extra_cmd" --ipex"
;;
--jit)
extra_cmd=$extra_cmd" --jit"
;;
--sq)
extra_cmd=$extra_cmd" --sq"
;;
--int4)
extra_cmd=$extra_cmd" --int4"
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done
}
function start_app {
if [[ $extra_cmd =~ "--ipex" ]]; then
echo -e '\n[INFO]: Installing IPEX llm branch..\n'
python3 -m pip install torch==2.1.0.dev20230711+cpu torchvision==0.16.0.dev20230711+cpu torchaudio==2.1.0.dev20230711+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
python3 -m pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.1.0.dev0%2Bcpu.llm-cp39-cp39-linux_x86_64.whl
conda install -y libstdcxx-ng=12 -c conda-forge
fi
# Check if there is a quantized model, when user selects smooth quantization
if [[ $extra_cmd =~ "--sq" ]]; then
echo -e '\n[INFO]: Re-installing app requirements to ensure PyTorch version is compatible..\n'
pip install -r requirements.txt
if ! [ -f "$FILE" ]; then
echo -e '\n[INFO]: Quantized model not detected, launching SmoothQuant process..\n'
conda create -y -n smoothquant python=3.9 && eval "$(conda shell.bash hook)" && conda activate smoothquant && pip install -r smoothquant/requirements.txt
python3 smoothquant/run_generation.py --model ${model_id} --alpha ${alpha} --auth_token ${auth_token} --quantize --sq --ipex
echo -e '\n[INFO]: Starting SmoothQuant performance evaluation..\n'
python3 smoothquant/run_generation.py --model ${model_id} --auth_token ${auth_token} --benchmark --ipex --int8
echo -e '\n[INFO]: Starting SmoothQuant accuracy evaluation..\n'
# TODO: Fix evaluation issue
# python3 smoothquant/run_generation.py --model ${model_id} --auth_token ${auth_token} --batch_size 112 --accuracy --int8 --ipex
conda deactivate && conda env remove -n smoothquant && conda activate $CONDA_ENV
fi
fi
# Setup environment variables for performance on Xeon
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
export KMP_BLOCKTIME=INF
export KMP_TPAUSE=0
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
export KMP_PLAIN_BARRIER_PATTERN=dist,dist
export KMP_REDUCTION_BARRIER_PATTERN=dist,dist
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
export OMP_NUM_THREADS=${PHYSICAL_CORES}
echo -e '\n[INFO]: Starting streamlit app..\n'
numactl -m 0 -C 0-$(($PHYSICAL_CORES-1)) streamlit run ${script} \
--server.port=${port} -- \
--auth_token=${auth_token} \
--model_id=${model_id} \
--window_len=${window_len} \
--dtype=${dtype} \
--device=${device} \
--max_new_tokens=${max_new_tokens} \
--prompt="${prompt}" \
--num_warmup=${num_warmup} \
${extra_cmd}
}
main "$@"