@@ -33,6 +33,7 @@ def _validate_completions_api_model_support(self) -> None:
# Returns the fixed string "hosted_vllm" as this model class's prefix; takes no input besides self.
# NOTE(review): presumably the provider prefix used when building the model identifier for requests — confirm against callers.
3333 def _get_model_prefix (self ) -> str :
3434 return "hosted_vllm"
3535
36+ @track_model_request
3637 async def _generate_native_structured_output (
3738 self ,
3839 input : ChatPromptValue ,
@@ -72,6 +73,7 @@ async def _generate_native_structured_output(
7273 api_key = model_params .auth_token ,
7374 ** extra_params ,
7475 )
76+ self ._extract_token_usage (completion )
7577 resp_text = completion .choices [0 ].model_dump ()["text" ]
7678 else :
7779 # Convert input to messages
@@ -84,6 +86,7 @@ async def _generate_native_structured_output(
8486 api_key = model_params .auth_token ,
8587 ** extra_params ,
8688 )
89+ self ._extract_token_usage (completion )
8790 resp_text = completion .choices [0 ].model_dump ()["message" ]["content" ]
8891 tool_calls = completion .choices [0 ].model_dump ()["message" ]["tool_calls" ]
8992
@@ -157,6 +160,7 @@ async def _generate_response(
157160 api_key = model_params .auth_token ,
158161 ** self .generation_params ,
159162 )
163+ self ._extract_token_usage (completion )
160164 resp_text = completion .choices [0 ].model_dump ()["text" ]
161165 else :
162166 # Convert input to messages
@@ -169,6 +173,7 @@ async def _generate_response(
169173 api_key = model_params .auth_token ,
170174 ** self .generation_params ,
171175 )
176+ self ._extract_token_usage (completion )
172177 resp_text = completion .choices [0 ].model_dump ()["message" ]["content" ]
173178 tool_calls = completion .choices [0 ].model_dump ()["message" ]["tool_calls" ]
174179 # TODO: Test rate limit handling for vllm
@@ -181,7 +186,7 @@ async def _generate_response(
181186 logger .error (f"vLLM request failed with error: { e .message } " )
182187 ret_code = e .status_code
183188 except Exception as x :
184- resp_text = f"{ constants .ERROR_PREFIX } Http request failed { x } "
189+ resp_text = f"{ constants .ERROR_PREFIX } vLLM request failed { x } "
185190 logger .error (resp_text )
186191 rcode = self ._get_status_from_body (x )
187192 if constants .ELEMAI_JOB_DOWN in resp_text or constants .CONNECTION_ERROR in resp_text :
0 commit comments