|
@@ -77,10 +77,10 @@ def test_different_draft_min_draft_max():
|
|
|
|
|
|
|
|
def test_slot_ctx_not_exceeded():
|
|
def test_slot_ctx_not_exceeded():
|
|
|
global server
|
|
global server
|
|
|
- server.n_ctx = 64
|
|
|
|
|
|
|
+ server.n_ctx = 256
|
|
|
server.start()
|
|
server.start()
|
|
|
res = server.make_request("POST", "/completion", data={
|
|
res = server.make_request("POST", "/completion", data={
|
|
|
- "prompt": "Hello " * 56,
|
|
|
|
|
|
|
+ "prompt": "Hello " * 248,
|
|
|
"temperature": 0.0,
|
|
"temperature": 0.0,
|
|
|
"top_k": 1,
|
|
"top_k": 1,
|
|
|
"speculative.p_min": 0.0,
|
|
"speculative.p_min": 0.0,
|
|
@@ -91,19 +91,19 @@ def test_slot_ctx_not_exceeded():
|
|
|
|
|
|
|
|
def test_with_ctx_shift():
|
|
def test_with_ctx_shift():
|
|
|
global server
|
|
global server
|
|
|
- server.n_ctx = 64
|
|
|
|
|
|
|
+ server.n_ctx = 256
|
|
|
server.enable_ctx_shift = True
|
|
server.enable_ctx_shift = True
|
|
|
server.start()
|
|
server.start()
|
|
|
res = server.make_request("POST", "/completion", data={
|
|
res = server.make_request("POST", "/completion", data={
|
|
|
- "prompt": "Hello " * 56,
|
|
|
|
|
|
|
+ "prompt": "Hello " * 248,
|
|
|
"temperature": 0.0,
|
|
"temperature": 0.0,
|
|
|
"top_k": 1,
|
|
"top_k": 1,
|
|
|
- "n_predict": 64,
|
|
|
|
|
|
|
+ "n_predict": 256,
|
|
|
"speculative.p_min": 0.0,
|
|
"speculative.p_min": 0.0,
|
|
|
})
|
|
})
|
|
|
assert res.status_code == 200
|
|
assert res.status_code == 200
|
|
|
assert len(res.body["content"]) > 0
|
|
assert len(res.body["content"]) > 0
|
|
|
- assert res.body["tokens_predicted"] == 64
|
|
|
|
|
|
|
+ assert res.body["tokens_predicted"] == 256
|
|
|
assert res.body["truncated"] == True
|
|
assert res.body["truncated"] == True
|
|
|
|
|
|
|
|
|
|
|