import os
import time

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# Configure your environment: both values are read from environment variables
# (optionally loaded from a .env file above), falling back to placeholders.
client = OpenAI(
    base_url=os.getenv("OLLAMA_BASE_URL", "你的URL"),
    api_key=os.getenv("OLLAMA_API_KEY", "你的APIKEY")
)


def test_model_speed(model_name, prompt="請寫一篇關於未來AI發展的500字文章。"):
    """Stream one chat completion from *model_name* and print speed metrics.

    Prints the time to first content chunk (TTFT), an estimated generation
    speed (TPS), the total elapsed time and the length of the response text.
    The token count is an estimate: every streamed chunk that carries content
    is counted as one token.

    Args:
        model_name: Model identifier passed to the chat completions endpoint.
        prompt: User message sent to the model.

    Returns:
        None. Results, and any error raised by the request or the stream, are
        printed rather than returned or raised.
    """
    print(f"🚀 正在測試模型: {model_name} ...")

    # perf_counter() is monotonic, so measured intervals are not affected by
    # system clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    first_token_time = None
    tokens_count = 0
    pieces = []

    try:
        stream = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            stream=True
        )

        for chunk in stream:
            # A chunk may carry no choices at all; indexing [0] on it would
            # raise IndexError and abort the whole measurement.
            if not chunk.choices:
                continue

            content = chunk.choices[0].delta.content
            if not content:
                continue

            if first_token_time is None:
                # Record the time to first token (TTFT).
                first_token_time = time.perf_counter() - start_time

            pieces.append(content)
            # Rough estimate: one content chunk is counted as one token.
            tokens_count += 1
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(f"❌ 測試出錯: {e}")
        return

    total_time = time.perf_counter() - start_time

    if first_token_time is None:
        # The stream finished without any content, so neither TTFT nor TPS
        # can be computed; say so instead of failing on None arithmetic.
        print("⚠️ 模型未回傳任何內容,無法計算速度。")
        return

    full_response = "".join(pieces)
    # Generation time excludes the wait for the first token.
    generation_time = total_time - first_token_time
    tps = tokens_count / generation_time if generation_time > 0 else 0

    print("-" * 30)
    print("📊 測試結果:")
    print(f"⏱️ 首字延遲 (TTFT): {first_token_time:.2f} 秒")
    print(f"⚡ 生成速度 (TPS): {tps:.2f} tokens/s")
    print(f"🕒 總耗時: {total_time:.2f} 秒")
    print(f"📝 總字數: {len(full_response)} 字")
    print("-" * 30)


if __name__ == "__main__":
    # Replace with the name of the model you actually want to test.
    test_model_speed("deepseek-v3.1:671b-cloud")