InternLM · lvhan028 · Jun 22, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py
@@ -44,10 +44,10 @@ def test_get_model(self, config, backend, model_case):
     def test_encode_s1(self, backend, model_case):
         api_client = APIClient(BASE_URL)
         input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
-        input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
+        input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
         input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
-        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
-        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
+        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
+        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)
 
         assert len(input_ids1) == length1 and length1 > 0
         assert len(input_ids2) == length2 and length2 > 0
@@ -64,10 +64,10 @@ def test_encode_s1(self, backend, model_case):
     def test_encode(self, backend, model_case):
         api_client = APIClient(BASE_URL)
         input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
-        input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
+        input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
         input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
-        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
-        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
+        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
+        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)
 
         assert len(input_ids1) == length1 and length1 > 0
         assert len(input_ids2) == length2 and length2 > 0
@@ -537,7 +537,7 @@ def test_ignore_eos_streaming(self, backend, model_case):
         for index in range(0, len(outputList) - 1):
             assert_chat_completions_stream_return(outputList[index], model_name)
             response += get_chat_delta_text(outputList[index].get('choices')[0])
-        length = api_client.encode(response, add_bos=False)[1]
+        length = api_client.encode(response)[1]
         assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
         assert length >= 99 and length <= 101
 
@@ -623,7 +623,7 @@ def __test_max_tokens_streaming_or_max_completion_tokens_streaming(
         for index in range(0, len(outputList) - 1):
             assert_chat_completions_stream_return(outputList[index], model_name)
             response += get_chat_delta_text(outputList[index].get('choices')[0])
-        length = api_client.encode(response, add_bos=False)[1]
+        length = api_client.encode(response)[1]
         assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
         assert length == 5 or length == 6
 
@@ -676,7 +676,7 @@ def test_logprobs_streaming(self, backend, model_case):
         for index in range(0, len(outputList) - 1):
             assert_chat_completions_stream_return(outputList[index], model_name, check_logprobs=True, logprobs_num=10)
             response += get_chat_delta_text(outputList[index].get('choices')[0])
-        length = api_client.encode(response, add_bos=False)[1]
+        length = api_client.encode(response)[1]
         assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
         assert length == 5 or length == 6
 
@@ -1026,7 +1026,7 @@ def test_max_tokens_streaming(self, backend, model_case):
             assert_chat_completions_stream_return(outputList[index], model_name)
             response += get_chat_delta_text(outputList[index].get('choices')[0])
         api_client = APIClient(BASE_URL)
-        length = api_client.encode(response, add_bos=False)[1]
+        length = api_client.encode(response)[1]
         assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
         assert length == 5 or length == 6
 
@@ -1080,7 +1080,7 @@ def test_logprobs_streaming(self, backend, model_case):
             assert_chat_completions_stream_return(outputList[index], model_name, check_logprobs=True, logprobs_num=10)
             response += get_chat_delta_text(outputList[index].get('choices')[0])
         api_client = APIClient(BASE_URL)
-        length = api_client.encode(response, add_bos=False)[1]
+        length = api_client.encode(response)[1]
         assert outputList[-1].get('choices')[0].get('finish_reason') == 'length'
         assert length == 5 or length == 6
 

diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py
@@ -28,10 +28,10 @@ def test_encode(self, backend, model_case):
         print(f'[test_encode] backend={backend!r} model_case={model_case!r}')
         api_client = APIClient(BASE_URL)
         input_ids1, length1 = api_client.encode('Hi, pls intro yourself')
-        input_ids2, length2 = api_client.encode('Hi, pls intro yourself', add_bos=False)
+        input_ids2, length2 = api_client.encode('Hi, pls intro yourself')
         input_ids3, length3 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
-        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True, add_bos=False)
-        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100, add_bos=False)
+        input_ids4, length4 = api_client.encode('Hi, pls intro yourself', do_preprocess=True)
+        input_ids5, length5 = api_client.encode('Hi, pls intro yourself' * 100)
         assert len(input_ids1) == length1 and length1 > 0
         assert len(input_ids2) == length2 and length2 > 0
         assert len(input_ids3) == length3 and length3 > 0

diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
@@ -944,7 +944,7 @@ def test_skip_special_tokens(self, config):
     def test_stop_token_ids(self):
         print(f'\n[Model: {self.model_name}] Running stop_token_ids test')
         api_client = APIClient(BASE_URL)
-        input_ids1, length1 = api_client.encode('.', add_bos=False)
+        input_ids1, length1 = api_client.encode('.')
         print(f'input_ids1={input_ids1}, length1={length1}')
 
         payload = {

diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
@@ -179,8 +179,6 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
                                                                                   top_p=top_p,
                                                                                   top_k=top_k,
                                                                                   ignore_eos=True),
-                                                      sequence_start=True,
-                                                      sequence_end=True,
                                                       stream_output=stream_output)
             try:
                 async for outputs in generator:
@@ -195,10 +193,6 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
             finally:
                 await generator.aclose()
 
-            # for pytorch engine to restart a session
-            if self.backend == 'pytorch':
-                await model_inst.async_end(session_id)
-
             self.pbar.update(1)
             session_id += concurrency
 

diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py
@@ -16,9 +16,6 @@ def input_prompt():
 
 def build_pipe(model_path, backend, trust_remote_code=False, **kwargs):
     engine_config = None
-    if kwargs.get('enable_prefix_caching', False):
-        print('interactive chat cannot be used when prefix caching is enabled')
-        exit(-1)
     if backend == 'turbomind':
         engine_config = TurbomindEngineConfig()
         for key, value in kwargs.items():
@@ -86,7 +83,7 @@ def main(model_path, backend, trust_remote_code=False, **kwargs):
                         quit = True
                         break
                     if prompt == 'end':
-                        sess.close()
+                        sess.reset()
                         break
                     if prompt == 'exit':
                         quit = True

diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -529,7 +529,6 @@ class ResponseType(enum.Enum):
     INPUT_LENGTH_ERROR = enum.auto()
     INTERNAL_ENGINE_ERROR = enum.auto()
     CANCEL = enum.auto()
-    PREFIX_CACHE_CONFLICT_INTERACTIVE_MODE = enum.auto()
     NO_QUEUE = enum.auto()