langchain-learning/ollama/tps_monitor.py
2026-04-14 03:19:18 +08:00

58 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import time
from openai import OpenAI
from dotenv import load_dotenv
# Load variables from a .env file (if any) so the lookups below can see them.
load_dotenv()

# Connection settings for the OpenAI-compatible endpoint. Both values are read
# from the environment; the fallback strings are placeholders that must be
# replaced (or overridden via OLLAMA_BASE_URL / OLLAMA_API_KEY) before use.
_base_url = os.getenv("OLLAMA_BASE_URL", "你的URL")
_api_key = os.getenv("OLLAMA_API_KEY", "你的APIKEY")
client = OpenAI(base_url=_base_url, api_key=_api_key)
def test_model_speed(model_name, prompt="請寫一篇關於未來AI發展的500字文章。"):
    """Stream one chat completion from ``model_name`` and print speed metrics.

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with ``stream=True`` and prints the time to first token (TTFT),
    an approximate tokens-per-second figure, the total elapsed time and the
    character count of the generated text.

    Args:
        model_name: Model identifier passed as ``model`` to the API.
        prompt: Text of the user message to send.

    Returns:
        None. Results are printed. Any exception raised while requesting or
        consuming the stream is caught and reported on stdout instead of
        propagating to the caller.
    """
    print(f"🚀 正在測試模型: {model_name} ...")
    # perf_counter() is monotonic, so the measured intervals cannot be skewed
    # by wall-clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    first_token_time = None
    tokens_count = 0
    pieces = []
    try:
        stream = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
        )
        for chunk in stream:
            # A chunk with an empty ``choices`` list would raise IndexError
            # on the lookup below; it carries no text, so skip it.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if not content:
                continue
            if first_token_time is None:
                # Record the time to first token (TTFT).
                first_token_time = time.perf_counter() - start_time
            pieces.append(content)
            # Rough estimate: every streamed content chunk is counted as one
            # token. Real token counts differ (Chinese is roughly 0.6-1 token
            # per character, English roughly 1.3 tokens per word).
            tokens_count += 1
        total_time = time.perf_counter() - start_time

        if first_token_time is None:
            # Nothing was generated: there is no TTFT to subtract, so report
            # the situation explicitly rather than failing on None arithmetic.
            print("❌ 測試出錯: 模型沒有回傳任何內容")
            return

        full_response = "".join(pieces)
        generation_time = total_time - first_token_time
        tps = tokens_count / generation_time if generation_time > 0 else 0
        print("-" * 30)
        print(f"📊 測試結果:")
        print(f"⏱️ 首字延遲 (TTFT): {first_token_time:.2f}")
        print(f"⚡ 生成速度 (TPS): {tps:.2f} tokens/s")
        print(f"🕒 總耗時: {total_time:.2f}")
        print(f"📝 總字數: {len(full_response)}")
        print("-" * 30)
    except Exception as e:
        # Top-level boundary of the benchmark: report the failure and return
        # so a failing model does not crash the calling script.
        print(f"❌ 測試出錯: {e}")
if __name__ == "__main__":
    # Replace with the name of the model you actually want to benchmark.
    target_model = "deepseek-v3.1:671b-cloud"
    test_model_speed(target_model)