Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions autotest/interface/restful/test_restful_chat_completions_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def test_get_model(self, config, backend, model_case):
def test_encode_s1(self, backend, model_case):
api_client = APIClient(BASE_URL)
input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)

assert len(input_ids1) == length1 and length1 > 0
assert len(input_ids2) == length2 and length2 > 0
Expand All @@ -64,10 +64,10 @@ def test_encode_s1(self, backend, model_case):
def test_encode(self, backend, model_case):
api_client = APIClient(BASE_URL)
input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)

assert len(input_ids1) == length1 and length1 > 0
assert len(input_ids2) == length2 and length2 > 0
Expand Down Expand Up @@ -537,7 +537,7 @@ def test_ignore_eos_streaming(self, backend, model_case):
for index in range(0, len(outputList) - 1):
assert_chat_completions_stream_return(outputList[index], model_name)
response += get_chat_delta_text(outputList[index].get('choices')[0])
length = api_client.encode(response, add_bos=False)[1]
length = api_client.encode(response)[1]
assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
assert length >= 99 and length <= 101

Expand Down Expand Up @@ -623,7 +623,7 @@ def __test_max_tokens_streaming_or_max_completion_tokens_streaming(
for index in range(0, len(outputList) - 1):
assert_chat_completions_stream_return(outputList[index], model_name)
response += get_chat_delta_text(outputList[index].get('choices')[0])
length = api_client.encode(response, add_bos=False)[1]
length = api_client.encode(response)[1]
assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
assert length == 5 or length == 6

Expand Down Expand Up @@ -676,7 +676,7 @@ def test_logprobs_streaming(self, backend, model_case):
for index in range(0, len(outputList) - 1):
assert_chat_completions_stream_return(outputList[index], model_name, check_logprobs=True, logprobs_num=10)
response += get_chat_delta_text(outputList[index].get('choices')[0])
length = api_client.encode(response, add_bos=False)[1]
length = api_client.encode(response)[1]
assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
assert length == 5 or length == 6

Expand Down Expand Up @@ -1026,7 +1026,7 @@ def test_max_tokens_streaming(self, backend, model_case):
assert_chat_completions_stream_return(outputList[index], model_name)
response += get_chat_delta_text(outputList[index].get('choices')[0])
api_client = APIClient(BASE_URL)
length = api_client.encode(response, add_bos=False)[1]
length = api_client.encode(response)[1]
assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
assert length == 5 or length == 6

Expand Down Expand Up @@ -1080,7 +1080,7 @@ def test_logprobs_streaming(self, backend, model_case):
assert_chat_completions_stream_return(outputList[index], model_name, check_logprobs=True, logprobs_num=10)
response += get_chat_delta_text(outputList[index].get('choices')[0])
api_client = APIClient(BASE_URL)
length = api_client.encode(response, add_bos=False)[1]
length = api_client.encode(response)[1]
assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
assert length == 5 or length == 6

Expand Down
6 changes: 3 additions & 3 deletions autotest/interface/restful/test_restful_completions_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def test_encode(self, backend, model_case):
print(f'[test_encode] backend={backend!r} model_case={model_case!r}')
api_client = APIClient(BASE_URL)
input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)
assert len(input_ids1) == length1 and length1 > 0
assert len(input_ids2) == length2 and length2 > 0
assert len(input_ids3) == length3 and length3 > 0
Expand Down
2 changes: 1 addition & 1 deletion autotest/interface/restful/test_restful_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ def test_skip_special_tokens(self, config):
def test_stop_token_ids(self):
print(f'\n[Model: {self.model_name}] Running stop_token_ids test')
api_client = APIClient(BASE_URL)
input_ids1, length1 = api_client.encode('.', add_bos=False)
input_ids1, length1 = api_client.encode('.')
print(f'input_ids1={input_ids1}, length1={length1}')

payload = {
Expand Down
6 changes: 0 additions & 6 deletions benchmark/profile_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,6 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
top_p=top_p,
top_k=top_k,
ignore_eos=True),
sequence_start=True,
sequence_end=True,
stream_output=stream_output)
try:
async for outputs in generator:
Expand All @@ -195,10 +193,6 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
finally:
await generator.aclose()

# for pytorch engine to restart a session
if self.backend == 'pytorch':
await model_inst.async_end(session_id)

self.pbar.update(1)
session_id += concurrency

Expand Down
5 changes: 1 addition & 4 deletions lmdeploy/cli/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@ def input_prompt():

def build_pipe(model_path, backend, trust_remote_code=False, **kwargs):
engine_config = None
if kwargs.get('enable_prefix_caching', False):
print('interactive chat cannot be used when prefix caching is enabled')
exit(-1)
if backend == 'turbomind':
engine_config = TurbomindEngineConfig()
for key, value in kwargs.items():
Expand Down Expand Up @@ -86,7 +83,7 @@ def main(model_path, backend, trust_remote_code=False, **kwargs):
quit = True
break
if prompt == 'end':
sess.close()
sess.reset()
break
if prompt == 'exit':
quit = True
Expand Down
1 change: 0 additions & 1 deletion lmdeploy/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,6 @@ class ResponseType(enum.Enum):
INPUT_LENGTH_ERROR = enum.auto()
INTERNAL_ENGINE_ERROR = enum.auto()
CANCEL = enum.auto()
PREFIX_CACHE_CONFLICT_INTERACTIVE_MODE = enum.auto()
NO_QUEUE = enum.auto()


Expand Down
Loading
Loading