
server: ci: windows build and tests (#5968)

* server: ci: windows build and tests

* server: ci: remove tmp push branch

* server: ci: EOF EOL

* Use builti

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* server: tests: server graceful shutdown, then kill, then hard kill

* server: tests: remove python2 unicode string

* server: tests: remove wrong comment on server starting,  close_fds is always true

* server: tests: server kill, if pid exists

* server: tests: remove dependency to killall

* server: tests: ci windows: pid exists better handling

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Pierrick Hymbert 1 year ago
Parent
Commit
fa8a809a91

+ 45 - 1
.github/workflows/server.yml

@@ -47,6 +47,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -58,7 +60,6 @@ jobs:
            cmake \
            python3-pip \
            wget \
-            psmisc \
            language-pack-en

      - name: Build
@@ -90,3 +91,46 @@ jobs:
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+
+
+  server-windows:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake ..  -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Tests
+        id: server_integration_tests
+        run: |
+          cd examples/server/tests
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        run: |
+          cd examples/server/tests
+          behave.exe --stop --no-skipped --no-capture --tags slow

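The new server-windows job builds the server target and then drives the same behave suite as the Linux job. For local debugging the same test step can be launched from Python; a minimal sketch, assuming behave was installed from examples/server/tests/requirements.txt and the repository root is the current working directory:

# Hypothetical local runner for the Windows CI test step shown above.
import subprocess

subprocess.run(
    ["behave", "--summary", "--stop", "--no-capture",
     "--exclude", "issues|wrong_usages|passkey",
     "--tags", "llama.cpp"],
    cwd="examples/server/tests",
    check=True,
)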
+ 47 - 19
examples/server/tests/features/environment.py

@@ -1,9 +1,10 @@
+import errno
 import os
 import socket
 import subprocess
 import time
 from contextlib import closing
-from signal import SIGKILL
+import signal


 def before_scenario(context, scenario):
@@ -29,44 +30,71 @@ def after_scenario(context, scenario):
                    for line in f:
                        print(line)
        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
+            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")

    if not pid_exists(context.server_process.pid):
        assert False, f"Server not running pid={context.server_process.pid} ..."
 
-    print(f"stopping server pid={context.server_process.pid} ...")
-    context.server_process.kill()
+    server_graceful_shutdown(context)
+
    # Wait few for socket to free up
    time.sleep(0.05)

    attempts = 0
-    while is_server_listening(context.server_fqdn, context.server_port):
-        print(f"stopping server pid={context.server_process.pid} ...")
-        os.kill(context.server_process.pid, SIGKILL)
+    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+        server_kill(context)
        time.sleep(0.1)
        attempts += 1
        if attempts > 5:
-            print(f"Server dangling exits, killing all {context.server_path} ...")
-            process = subprocess.run(['killall', '-9', context.server_path],
-                                     stderr=subprocess.PIPE,
-                                     universal_newlines=True)
-            print(process)
+            server_kill_hard(context)
+
+
+def server_graceful_shutdown(context):
+    print(f"shutting down server pid={context.server_process.pid} ...\n")
+    if os.name == 'nt':
+        os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
+    else:
+        os.kill(context.server_process.pid, signal.SIGINT)
+
+
+def server_kill(context):
+    print(f"killing server pid={context.server_process.pid} ...\n")
+    context.server_process.kill()
+
+
+def server_kill_hard(context):
+    pid = context.server_process.pid
+    path = context.server_path
+
+    print(f"Server dangling exits, hard killing force {pid}={path}...\n")
+    if os.name == 'nt':
+        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
+        print(process)
+    else:
+        os.kill(-pid, signal.SIGKILL)


 def is_server_listening(server_fqdn, server_port):
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
         result = sock.connect_ex((server_fqdn, server_port))
-        return result == 0
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...\n")
+        return _is_server_listening


 def pid_exists(pid):
     """Check whether pid exists in the current process table."""
-    import errno
    if pid < 0:
        return False
-    try:
-        os.kill(pid, 0)
-    except OSError as e:
-        return e.errno == errno.EPERM
+    if os.name == 'nt':
+        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
+        print(output)
+        return "No tasks are running" not in output
    else:
-        return True
+        try:
+            os.kill(pid, 0)
+        except OSError as e:
+            return e.errno == errno.EPERM
+        else:
+            return True

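The escalation added above is: ask the server to exit (a console CTRL+C event on Windows, SIGINT elsewhere), fall back to Popen.kill(), and only after repeated attempts force-kill via taskkill /F or SIGKILL on the process group. A condensed sketch of that flow, with hypothetical names (proc is the subprocess.Popen handle, is_listening a caller-supplied check):

import os
import signal
import subprocess
import time

def stop_server(proc, is_listening):
    # 1) graceful shutdown: console CTRL+C on Windows, SIGINT on POSIX
    if os.name == 'nt':
        os.kill(proc.pid, signal.CTRL_C_EVENT)
    else:
        os.kill(proc.pid, signal.SIGINT)
    time.sleep(0.05)

    attempts = 0
    while proc.poll() is None or is_listening():
        # 2) escalate to a normal kill of the child process
        proc.kill()
        time.sleep(0.1)
        attempts += 1
        if attempts > 5:
            # 3) last resort: hard kill (taskkill on Windows, SIGKILL to the group on POSIX)
            if os.name == 'nt':
                subprocess.check_output(['taskkill', '/F', '/pid', str(proc.pid)])
            else:
                os.kill(-proc.pid, signal.SIGKILL)
            break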
+ 1 - 1
examples/server/tests/features/server.feature

@@ -47,7 +47,7 @@ Feature: llama.cpp server
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns
+    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry
    And   the completion is  truncated
    And   109 prompt tokens are processed
 

+ 98 - 78
examples/server/tests/features/steps/steps.py

@@ -18,7 +18,7 @@ from huggingface_hub import hf_hub_download
 from prometheus_client import parser
 
 
-@step(u"a server listening on {server_fqdn}:{server_port}")
+@step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
@@ -57,24 +57,24 @@ def step_server_config(context, server_fqdn, server_port):
     context.prompts = []
 
 
-@step(u'a model file {hf_file} from HF repo {hf_repo}')
+@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
     context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
     if context.debug:
         print(f"model file: {context.model_file}\n")
 
 
-@step(u'a model alias {model_alias}')
+@step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias
 
 
-@step(u'{seed:d} as server seed')
+@step('{seed:d} as server seed')
 def step_seed(context, seed):
     context.server_seed = seed
 
 
-@step(u'{ngl:d} GPU offloaded layers')
+@step('{ngl:d} GPU offloaded layers')
 def step_n_gpu_layer(context, ngl):
     if 'N_GPU_LAYERS' in os.environ:
         new_ngl = int(os.environ['N_GPU_LAYERS'])
@@ -84,37 +84,37 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl
 
 
-@step(u'{n_ctx:d} KV cache size')
+@step('{n_ctx:d} KV cache size')
 def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx
 
 
-@step(u'{n_slots:d} slots')
+@step('{n_slots:d} slots')
 def step_n_slots(context, n_slots):
     context.n_slots = n_slots
 
 
-@step(u'{n_predict:d} server max tokens to predict')
+@step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict):
     context.n_server_predict = n_predict
 
 
-@step(u'continuous batching')
+@step('continuous batching')
 def step_server_continuous_batching(context):
     context.server_continuous_batching = True
 
 
-@step(u'embeddings extraction')
+@step('embeddings extraction')
 def step_server_embeddings(context):
     context.server_embeddings = True
 
 
-@step(u'prometheus compatible metrics exposed')
+@step('prometheus compatible metrics exposed')
 def step_server_metrics(context):
     context.server_metrics = True
 
 
-@step(u"the server is starting")
+@step("the server is starting")
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
@@ -131,7 +131,7 @@ def step_start_server(context):
             time.sleep(0.1)
 
 
-@step(u"the server is {expecting_status}")
+@step("the server is {expecting_status}")
 @async_run_until_complete
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
@@ -160,7 +160,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
             assert False, "unknown status"
 
 
-@step(u'all slots are {expected_slot_status_string}')
+@step('all slots are {expected_slot_status_string}')
 @async_run_until_complete
 async def step_all_slots_status(context, expected_slot_status_string):
     match expected_slot_status_string:
@@ -176,7 +176,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
     await request_slots_status(context, expected_slots)
 
 
-@step(u'a completion request with {api_error} api error')
+@step('a completion request with {api_error} api error')
 @async_run_until_complete
 async def step_request_completion(context, api_error):
     expect_api_error = api_error == 'raised'
@@ -194,133 +194,133 @@ async def step_request_completion(context, api_error):
         assert completion == 401, f"completion must be an 401 status code: {completion}"
 
 
-@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
+@step('{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
     context.completion = context.tasks_result.pop()
     assert_n_tokens_predicted(context.completion, predicted_n, re_content)
 
 
-@step(u'{predicted_n:d} tokens are predicted')
+@step('{predicted_n:d} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
     context.completion = context.tasks_result.pop()
     assert_n_tokens_predicted(context.completion, predicted_n)
 
 
-@step(u'the completion is  truncated')
+@step('the completion is  truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
 
 
-@step(u'the completion is {truncated} truncated')
+@step('the completion is {truncated} truncated')
 def step_assert_completion_truncated(context, truncated):
     truncated = truncated != "not"
     assert context.completion['truncated'] == truncated, f'{context.completion}'
 
 
-@step(u'{n_prompt:d} prompt tokens are processed')
+@step('{n_prompt:d} prompt tokens are processed')
 def step_impl(context, n_prompt):
     assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
 
 
-@step(u'a user prompt {user_prompt}')
+@step('a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.prompts.append(user_prompt)
     context.n_prompts = len(context.prompts)
 
 
-@step(u'a system prompt {system_prompt}')
+@step('a system prompt {system_prompt}')
 def step_system_prompt(context, system_prompt):
     context.system_prompt = system_prompt
 
 
-@step(u'a model {model}')
+@step('a model {model}')
 def step_model(context, model):
     context.model = model
 
 
-@step(u'{max_tokens:d} max tokens to predict')
+@step('{max_tokens:d} max tokens to predict')
 def step_max_tokens(context, max_tokens):
     context.n_predict = max_tokens
 
 
-@step(u'streaming is {enable_streaming}')
+@step('streaming is {enable_streaming}')
 def step_streaming(context, enable_streaming):
     context.enable_streaming = enable_streaming == 'enabled'
 
 
-@step(u'a user api key {user_api_key}')
+@step('a user api key {user_api_key}')
 def step_user_api_key(context, user_api_key):
     context.user_api_key = user_api_key
 
 
-@step(u'no user api key')
+@step('no user api key')
 def step_no_user_api_key(context):
     context.user_api_key = None
 
 
-@step(u'a user api key ')
+@step('a user api key ')
 def step_no_user_api_key_space(context):
     context.user_api_key = None
 
 
-@step(u'a server api key {server_api_key}')
+@step('a server api key {server_api_key}')
 def step_server_api_key(context, server_api_key):
     context.server_api_key = server_api_key
 
 
-@step(u'{n_junk:d} as number of junk')
+@step('{n_junk:d} as number of junk')
 def step_n_junk(context, n_junk):
     context.n_junk = n_junk
 
 
-@step(u'{n_batch:d} as batch size')
+@step('{n_batch:d} as batch size')
 def step_n_batch(context, n_batch):
     context.n_batch = n_batch
 
 
-@step(u'{seed:d} as seed')
+@step('{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
 
 
-@step(u'a prefix prompt')
+@step('a prefix prompt')
 def step_prompt_prefix(context):
-    context.prompt_prefix = context.text
+    context.prompt_prefix = context_text(context)
 
 
-@step(u'a junk suffix prompt')
+@step('a junk suffix prompt')
 def step_prompt_junk_suffix(context):
-    context.prompt_junk_suffix = context.text
+    context.prompt_junk_suffix = context_text(context)
 
 
-@step(u'a suffix prompt')
+@step('a suffix prompt')
 def step_prompt_suffix(context):
-    context.prompt_suffix = context.text
+    context.prompt_suffix = context_text(context)
 
 
-@step(u'{n_ga:d} group attention factor'
-      u' to extend context size through self-extend')
+@step('{n_ga:d} group attention factor'
+      ' to extend context size through self-extend')
 def step_impl(context, n_ga):
     context.n_ga = n_ga
 
 
-@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
+@step('{n_ga_w:d} group attention width to extend context size through self-extend')
 def step_impl(context, n_ga_w):
     context.n_ga_w = n_ga_w
 
 
-@step(u'a passkey prompt template')
+@step('a passkey prompt template')
 def step_prompt_passkey(context):
-    context.prompt_passkey = context.text
+    context.prompt_passkey = context_text(context)
 
 
-@step(u'{n_prompts:d} fixed prompts')
+@step('{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
     context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
     context.n_prompts = n_prompts
 
 
-@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
+@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
 def step_prompt_passkey(context, passkey, i_pos):
     prompt = ""
     for i in range(context.n_junk):
@@ -334,7 +334,7 @@ def step_prompt_passkey(context, passkey, i_pos):
     context.n_prompts = len(context.prompts)
 
 
-@step(u'an OAI compatible chat completions request with {api_error} api error')
+@step('an OAI compatible chat completions request with {api_error} api error')
 @async_run_until_complete
 async def step_oai_chat_completions(context, api_error):
     if context.debug:
@@ -369,19 +369,19 @@ async def step_oai_chat_completions(context, api_error):
         print(f"Completion response: {completion}")
 
 
-@step(u'a prompt')
+@step('a prompt')
 def step_a_prompt(context):
-    context.prompts.append(context.text)
+    context.prompts.append(context_text(context))
     context.n_prompts = len(context.prompts)
 
 
-@step(u'a prompt {prompt}')
+@step('a prompt {prompt}')
 def step_a_prompt_prompt(context, prompt):
     context.prompts.append(prompt)
     context.n_prompts = len(context.prompts)
 
 
-@step(u'concurrent completion requests')
+@step('concurrent completion requests')
 @async_run_until_complete()
 async def step_concurrent_completion_requests(context):
     await concurrent_requests(context,
@@ -397,7 +397,7 @@ async def step_concurrent_completion_requests(context):
                                                                            'user_api_key') else None)
 
 
-@step(u'concurrent OAI completions requests')
+@step('concurrent OAI completions requests')
 @async_run_until_complete
 async def step_oai_chat_completions(context):
     await concurrent_requests(context, oai_chat_completions,
@@ -417,7 +417,7 @@ async def step_oai_chat_completions(context):
                               if hasattr(context, 'user_api_key') else None)
 
 
-@step(u'concurrent OAI completions requests no v1')
+@step('concurrent OAI completions requests no v1')
 @async_run_until_complete
 async def step_oai_chat_completions(context):
     await concurrent_requests(context, oai_chat_completions,
@@ -440,13 +440,13 @@ async def step_oai_chat_completions(context):
                               if hasattr(context, 'user_api_key') else None)
 
 
-@step(u'all prompts are predicted')
+@step('all prompts are predicted')
 @async_run_until_complete
 async def step_all_prompts_are_predicted(context):
     await all_prompts_are_predicted(context)
 
 
-@step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
+@step('all prompts are predicted with {n_expected_predicted:d} tokens')
 @async_run_until_complete
 async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
     await all_prompts_are_predicted(context, n_expected_predicted)
@@ -460,14 +460,14 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
     assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
 
 
-@step(u'embeddings are computed for')
+@step('embeddings are computed for')
 @async_run_until_complete
 async def step_compute_embedding(context):
     context.n_prompts = 1
-    context.embeddings = await request_embedding(context.text, base_url=context.base_url)
+    context.embeddings = await request_embedding(context_text(context), base_url=context.base_url)
 
 
-@step(u'all embeddings are the same')
+@step('all embeddings are the same')
 @async_run_until_complete
 async def step_all_embeddings_are_the_same(context):
     n_embedding_requests = await gather_tasks_results(context)
@@ -491,7 +491,8 @@ async def step_all_embeddings_are_the_same(context):
                 print(f"{msg}\n")
             assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
 
-@step(u'embeddings are generated')
+
+@step('embeddings are generated')
 def step_assert_embeddings(context):
     assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
                                                              f"context.n_prompts={context.n_prompts}\n"
@@ -500,17 +501,17 @@ def step_assert_embeddings(context):
         assert_embeddings(embedding)
 
 
-@step(u'an OAI compatible embeddings computation request for')
+@step('an OAI compatible embeddings computation request for')
 @async_run_until_complete
 async def step_oai_compute_embeddings(context):
     context.n_prompts = 1
-    context.embeddings = await request_oai_embeddings(context.text,
+    context.embeddings = await request_oai_embeddings(context_text(context),
                                                       base_url=context.base_url,
                                                       user_api_key=context.user_api_key,
                                                       model=context.model)
 
 
-@step(u'an OAI compatible embeddings computation request for multiple inputs')
+@step('an OAI compatible embeddings computation request for multiple inputs')
 @async_run_until_complete
 async def step_oai_compute_embeddings_multiple_inputs(context):
     context.embeddings = await request_oai_embeddings(context.prompts,
@@ -520,7 +521,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
     context.prompts.clear()
 
 
-@step(u'concurrent embedding requests')
+@step('concurrent embedding requests')
 @async_run_until_complete()
 async def step_concurrent_embedding_requests(context):
     await concurrent_requests(context,
@@ -529,7 +530,7 @@ async def step_concurrent_embedding_requests(context):
                               base_url=context.base_url)
 
 
-@step(u'concurrent OAI embedding requests')
+@step('concurrent OAI embedding requests')
 @async_run_until_complete()
 async def step_concurrent_oai_embedding_requests(context):
     await concurrent_requests(context,
@@ -540,7 +541,7 @@ async def step_concurrent_oai_embedding_requests(context):
                               model=context.model)
 
 
-@step(u'all embeddings are generated')
+@step('all embeddings are generated')
 @async_run_until_complete()
 async def all_embeddings_are_generated(context):
     n_embedding_requests = await gather_tasks_results(context)
@@ -549,10 +550,10 @@ async def all_embeddings_are_generated(context):
         assert_embeddings(context.tasks_result.pop().pop())
 
 
-@step(u'tokenizing')
+@step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
-    context.tokenized_text = context.text
+    context.tokenized_text = context_text(context)
     async with aiohttp.ClientSession() as session:
         async with session.post(f'{context.base_url}/tokenize',
                                 json={
@@ -563,7 +564,7 @@ async def step_tokenize(context):
             context.tokens = tokenize_json['tokens']
 
 
-@step(u'tokens can be detokenize')
+@step('tokens can be detokenize')
 @async_run_until_complete
 async def step_detokenize(context):
     assert len(context.tokens) > 0
@@ -578,7 +579,7 @@ async def step_detokenize(context):
             assert context.tokenized_text == detokenize_json['content'].strip()
 
 
-@step(u'an OPTIONS request is sent from {origin}')
+@step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
 async def step_options_request(context, origin):
     async with aiohttp.ClientSession() as session:
@@ -589,12 +590,12 @@ async def step_options_request(context, origin):
             context.options_response = response
 
 
-@step(u'CORS header {cors_header} is set to {cors_header_value}')
+@step('CORS header {cors_header} is set to {cors_header_value}')
 def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value
 
 
-@step(u'prometheus metrics are exposed')
+@step('prometheus metrics are exposed')
 @async_run_until_complete
 async def step_prometheus_metrics_exported(context):
     async with aiohttp.ClientSession() as session:
@@ -616,14 +617,14 @@ async def step_prometheus_metrics_exported(context):
             assert metric_exported, "No metrics exported"
 
 
-@step(u'metric {metric_name} is {metric_value:d}')
+@step('metric {metric_name} is {metric_value:d}')
 def step_assert_metric_value(context, metric_name, metric_value):
     if metric_name not in context.metrics:
         assert False, f"no metric {metric_name} in {context.metrics.keys()}"
     assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
 
 
-@step(u'available models')
+@step('available models')
 def step_available_models(context):
     # openai client always expects an api_key
     openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
@@ -631,14 +632,14 @@ def step_available_models(context):
     context.models = openai.Model.list().data
 
 
-@step(u'{n_model:d} models are supported')
+@step('{n_model:d} models are supported')
 def step_supported_models(context, n_model):
     if context.debug:
         print("server models available:", context.models)
     assert len(context.models) == n_model
 
 
-@step(u'model {i_model:d} is {param} {preposition} {param_value}')
+@step('model {i_model:d} is {param} {preposition} {param_value}')
 def step_supported_models(context, i_model, param, preposition, param_value):
     assert i_model < len(context.models)
     model = context.models[i_model]
@@ -1007,12 +1008,22 @@ async def completions_seed(context):
         else context.server_seed if hasattr(context, 'server_seed') else None
 
 
+def context_text(context):
+    return context.text.replace('\r', '')
+
+
 def start_server_background(context):
-    context.server_path = '../../../build/bin/server'
+    if os.name == 'nt':
+        context.server_path = '../../../build/bin/Release/server.exe'
+    else:
+        context.server_path = '../../../build/bin/server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+    server_listen_addr = context.server_fqdn
+    if os.name == 'nt':
+        server_listen_addr = '0.0.0.0'
     server_args = [
-        '--host', context.server_fqdn,
+        '--host', server_listen_addr,
         '--port', context.server_port,
         '--model', context.model_file
     ]
@@ -1045,7 +1056,16 @@ def start_server_background(context):
     if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
         server_args.extend(['--log-format', "text"])
     print(f"starting server with: {context.server_path} {server_args}\n")
+    flags = 0
+    if 'nt' == os.name:
+        flags |= subprocess.DETACHED_PROCESS
+        flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+        flags |= subprocess.CREATE_NO_WINDOW
+
+    pkwargs = {
+        'creationflags': flags,
+    }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
-        close_fds=True)
-    print(f"server pid={context.server_process.pid}")
+        **pkwargs)
+    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
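On Windows the server has to be started detached and in its own process group so the CTRL_C_EVENT used during shutdown can reach it without also interrupting the behave process. A minimal sketch of that launch, assuming server_path and server_args are assembled as in start_server_background above:

import os
import subprocess

def spawn_server(server_path, server_args):
    flags = 0
    if os.name == 'nt':
        # Windows-only flags: detach from the parent console, create a new
        # process group (so CTRL_C_EVENT can target it), and open no window.
        flags |= subprocess.DETACHED_PROCESS
        flags |= subprocess.CREATE_NEW_PROCESS_GROUP
        flags |= subprocess.CREATE_NO_WINDOW
    return subprocess.Popen([str(arg) for arg in [server_path, *server_args]],
                            creationflags=flags)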