|
@@ -64,6 +64,7 @@ class ServerProcess:
|
|
|
server_embeddings: bool | None = False
|
|
server_embeddings: bool | None = False
|
|
|
server_reranking: bool | None = False
|
|
server_reranking: bool | None = False
|
|
|
server_metrics: bool | None = False
|
|
server_metrics: bool | None = False
|
|
|
|
|
+ server_slots: bool | None = False
|
|
|
draft: int | None = None
|
|
draft: int | None = None
|
|
|
api_key: str | None = None
|
|
api_key: str | None = None
|
|
|
response_format: str | None = None
|
|
response_format: str | None = None
|
|
@@ -91,7 +92,6 @@ class ServerProcess:
|
|
|
else:
|
|
else:
|
|
|
server_path = "../../../build/bin/llama-server"
|
|
server_path = "../../../build/bin/llama-server"
|
|
|
server_args = [
|
|
server_args = [
|
|
|
- "--slots", # requires to get slot status via /slots endpoint
|
|
|
|
|
"--host",
|
|
"--host",
|
|
|
self.server_host,
|
|
self.server_host,
|
|
|
"--port",
|
|
"--port",
|
|
@@ -129,6 +129,8 @@ class ServerProcess:
|
|
|
server_args.append("--reranking")
|
|
server_args.append("--reranking")
|
|
|
if self.server_metrics:
|
|
if self.server_metrics:
|
|
|
server_args.append("--metrics")
|
|
server_args.append("--metrics")
|
|
|
|
|
+ if self.server_slots:
|
|
|
|
|
+ server_args.append("--slots")
|
|
|
if self.model_alias:
|
|
if self.model_alias:
|
|
|
server_args.extend(["--alias", self.model_alias])
|
|
server_args.extend(["--alias", self.model_alias])
|
|
|
if self.n_ctx:
|
|
if self.n_ctx:
|
|
@@ -181,7 +183,7 @@ class ServerProcess:
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
|
while time.time() - start_time < timeout_seconds:
|
|
while time.time() - start_time < timeout_seconds:
|
|
|
try:
|
|
try:
|
|
|
- response = self.make_request("GET", "/slots", headers={
|
|
|
|
|
|
|
+ response = self.make_request("GET", "/health", headers={
|
|
|
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
|
|
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
|
|
|
})
|
|
})
|
|
|
if response.status_code == 200:
|
|
if response.status_code == 200:
|
|
@@ -224,7 +226,7 @@ class ServerProcess:
|
|
|
result.headers = dict(response.headers)
|
|
result.headers = dict(response.headers)
|
|
|
result.status_code = response.status_code
|
|
result.status_code = response.status_code
|
|
|
result.body = response.json() if parse_body else None
|
|
result.body = response.json() if parse_body else None
|
|
|
- print("Response from server", result.body)
|
|
|
|
|
|
|
+ print("Response from server", json.dumps(result.body, indent=2))
|
|
|
return result
|
|
return result
|
|
|
|
|
|
|
|
def make_stream_request(
|
|
def make_stream_request(
|
|
@@ -245,7 +247,7 @@ class ServerProcess:
|
|
|
break
|
|
break
|
|
|
elif line.startswith('data: '):
|
|
elif line.startswith('data: '):
|
|
|
data = json.loads(line[6:])
|
|
data = json.loads(line[6:])
|
|
|
- print("Partial response from server", data)
|
|
|
|
|
|
|
+ print("Partial response from server", json.dumps(data, indent=2))
|
|
|
yield data
|
|
yield data
|
|
|
|
|
|
|
|
|
|
|