diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 49a1700a8..65d890158 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -117,6 +117,18 @@ jobs: cat results.github.env >> $GITHUB_ENV + # Remove dataset as we do not want it in the artefact + rm ShareGPT_V3_unfiltered_cleaned_split.json + + - uses: actions/upload-artifact@v4 + with: + name: benchmark-results + compression-level: 9 + path: | + examples/server/bench/*.png + examples/server/bench/*.json + examples/server/bench/*.log + - name: Commit status uses: Sibz/github-status-action@v1 with: @@ -128,6 +140,7 @@ jobs: - name: Upload benchmark images uses: devicons/public-upload-to-imgur@v2.2.2 + continue-on-error: true # Important as it looks unstable: 503 id: imgur_step with: client_id: ${{secrets.IMGUR_CLIENT_ID}} @@ -136,44 +149,95 @@ jobs: examples/server/bench/predicted_tokens_seconds.png examples/server/bench/kv_cache_usage_ratio.png examples/server/bench/requests_processing.png - examples/server/bench/requests_deferred.png + + - name: Extract mermaid + id: set_mermaid + run: | + set -eux + + cd examples/server/bench + PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) + echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV + echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) + echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV + echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) + echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV + echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + REQUESTS_PROCESSING=$(cat requests_processing.mermaid) + echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV + echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV - name: Comment PR uses: mshick/add-pr-comment@v2 id: comment_pr if: ${{ github.event.pull_request != '' }} + continue-on-error: 
true with: message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }} message: | 📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + + - ${{ env.BENCH_GRAPH_XLABEL }} + - req_avg=${{ env.HTTP_REQ_DURATION_AVG }} pp_avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }} tks_avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }} + +

- prompt_tokens_seconds - predicted_tokens_seconds + prompt_tokens_seconds + +

+ <summary>More</summary> + + ```mermaid + ${{ env.PROMPT_TOKENS_SECONDS }} + ``` + +
+ + predicted_tokens_seconds + +
+ <summary>More</summary> + + ```mermaid + ${{ env.PREDICTED_TOKENS_SECONDS }} + ``` + +
+

<summary>Details</summary>

- kv_cache_usage_ratio - requests_processing - requests_deferred -

- + kv_cache_usage_ratio - - name: Upload results - if: ${{ github.event.pull_request }} - uses: edunad/actions-image@v2.0.0 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - path: 'examples/server/bench/*.png' - title: | - llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG}}tk/s - annotationLevel: 'success' +
+ <summary>More</summary> - - uses: actions/upload-artifact@v4 - with: - name: benchmark-results - compression-level: 9 - path: | - examples/server/bench/*.png - examples/server/bench/*.json - examples/server/bench/*.log + ```mermaid + ${{ env.KV_CACHE_USAGE_RATIO }} + ``` + +
+ + requests_processing + +
+ <summary>More</summary> + + ```mermaid + ${{ env.REQUESTS_PROCESSING }} + ``` + +
+ +

+
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 3a213cce9..cee972431 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -101,6 +101,12 @@ def main(args_in: list[str] | None = None) -> None: while is_server_listening(args.host, args.port): time.sleep(0.1) + title = (f"llama.cpp {args.name} on {args.runner_label}\n " + f"duration={args.duration} {iterations} iterations") + xlabel = (f"{args.hf_repo}/{args.hf_file}\n" + f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" + f"branch={args.branch} commit={args.commit}") + # Prometheus end_time = time.time() if is_server_listening("0.0.0.0", 9090): @@ -121,23 +127,20 @@ def main(args_in: list[str] | None = None) -> None: values = metric_data['data']['result'][0]['values'] timestamps, metric_values = zip(*values) metric_values = [float(value) for value in metric_values] - timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps] + timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] plt.figure(figsize=(16, 10), dpi=80) - plt.plot(timestamps, metric_values, label=metric) + plt.plot(timestamps_dt, metric_values, label=metric) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) plt.yticks(fontsize=12, alpha=.7) - plt.title(f"llama.cpp {args.name} on {args.runner_label}\n" - f"duration={args.duration} {iterations} iterations", + ylabel = f"llamacpp:{metric}" + plt.title(title, fontsize=14, wrap=True) plt.grid(axis='both', alpha=.3) - plt.ylabel(f"llamacpp:{metric}", fontsize=22) - plt.xlabel(f"{args.hf_repo}/{args.hf_file}\n" - f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n" - f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" - f"branch={args.branch} commit={args.commit}", 
fontsize=14, wrap=True) + plt.ylabel(ylabel, fontsize=22) + plt.xlabel(xlabel, fontsize=14, wrap=True) plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator()) - plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S")) + plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S")) plt.gcf().autofmt_xdate() # Remove borders @@ -150,6 +153,27 @@ def main(args_in: list[str] | None = None) -> None: plt.savefig(f'{metric}.png') plt.close() + # Mermaid format in case image failed + with (open(f"{metric}.mermaid", 'w') as mermaid_f): + mermaid = ( + f"""--- +config: + xyChart: + titleFontSize: 12 + width: 900 + height: 600 + themeVariables: + xyChart: + titleColor: "#000000" +--- +xychart-beta + title "{title}" + y-axis "llamacpp:{metric}" + x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))} + line [{', '.join([str(round(float(value))) for value in metric_values])}] + """) + mermaid_f.write(mermaid) + # 140 chars max for commit status description bench_results = { "req": { @@ -169,6 +193,11 @@ def main(args_in: list[str] | None = None) -> None: github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n") github_env.write(f"BENCH_ITERATIONS={iterations}\n") + title = title.replace('\n', ' ') + xlabel = xlabel.replace('\n', ' ') + github_env.write(f"BENCH_GRAPH_TITLE={title}\n") + github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n") + def start_benchmark(args): k6_path = 'k6'