add tokenizer

Author: musistudio
Date:   2025-12-31 12:40:27 +08:00
Parent: 38cc5feadb
Commit: b69ebf7ad1

13 changed files with 1116 additions and 13 deletions


@@ -1,4 +1,4 @@
-import Server, { calculateTokenCount } from "@musistudio/llms";
+import Server, { calculateTokenCount, TokenizerService } from "@musistudio/llms";
 import { readConfigFile, writeConfigFile, backupConfigFile } from "./utils";
 import { join } from "path";
 import fastifyStatic from "@fastify/static";
@@ -34,7 +34,48 @@ export const createServer = async (config: any): Promise<any> => {
   });
   app.post("/v1/messages/count_tokens", async (req: any, reply: any) => {
-    const {messages, tools, system} = req.body;
+    const {messages, tools, system, model} = req.body;
+    const tokenizerService = (app as any)._server!.tokenizerService as TokenizerService;
+    // If the model is specified in "providerName,modelName" format, use the configured tokenizer
+    if (model && model.includes(",") && tokenizerService) {
+      try {
+        const [provider, modelName] = model.split(",");
+        req.log?.info(`Looking up tokenizer for provider: ${provider}, model: ${modelName}`);
+        const tokenizerConfig = tokenizerService.getTokenizerConfigForModel(provider, modelName);
+        if (!tokenizerConfig) {
+          req.log?.warn(`No tokenizer config found for ${provider},${modelName}, using default tiktoken`);
+        } else {
+          req.log?.info(`Using tokenizer config: ${JSON.stringify(tokenizerConfig)}`);
+        }
+        const result = await tokenizerService.countTokens(
+          { messages, system, tools },
+          tokenizerConfig
+        );
+        return {
+          "input_tokens": result.tokenCount,
+          "tokenizer": result.tokenizerUsed,
+        };
+      } catch (error: any) {
+        req.log?.error(`Error using configured tokenizer: ${error.message}`);
+        req.log?.error(error.stack);
+        // Fall back to the default calculation
+      }
+    } else {
+      if (!model) {
+        req.log?.info(`No model specified, using default tiktoken`);
+      } else if (!model.includes(",")) {
+        req.log?.info(`Model "${model}" does not contain a comma, using default tiktoken`);
+      } else if (!tokenizerService) {
+        req.log?.warn(`TokenizerService not available, using default tiktoken`);
+      }
+    }
+    // Default to the tiktoken calculation
     const tokenCount = calculateTokenCount(messages, system, tools);
     return { "input_tokens": tokenCount }
   });
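
For reference, a minimal sketch of how a client might exercise the new endpoint. The base URL (claude-code-router conventionally listens on 127.0.0.1:3456), the provider name "openrouter", and the model name "gpt-4o" are illustrative assumptions, not values defined by this commit:

// Hypothetical client call; the URL and "openrouter,gpt-4o" are placeholders.
const res = await fetch("http://127.0.0.1:3456/v1/messages/count_tokens", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    // The "providerName,modelName" shape triggers the TokenizerService lookup above.
    model: "openrouter,gpt-4o",
    system: "You are a helpful assistant.",
    messages: [{ role: "user", content: "Hello!" }],
  }),
});
const data = await res.json();
console.log(data.input_tokens, data.tokenizer); // "tokenizer" appears only on the configured path

A model without a comma, or no model at all, takes the tiktoken fallback path, so the response then carries only input_tokens.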


@@ -68,4 +68,54 @@ declare module "@musistudio/llms" {
     constructor(configService: any, logger: any);
     initialize(): Promise<void>;
   }
+
+  // Tokenizer types
+  export type TokenizerType = 'tiktoken' | 'huggingface' | 'api';
+  export type ApiRequestFormat = 'standard' | 'openai' | 'anthropic' | 'custom';
+
+  export interface TokenizerConfig {
+    type: TokenizerType;
+    encoding?: string;
+    model?: string;
+    url?: string;
+    apiKey?: string;
+    requestFormat?: ApiRequestFormat;
+    responseField?: string;
+    headers?: Record<string, string>;
+    fallback?: TokenizerType;
+  }
+
+  export interface TokenizeRequest {
+    messages: Array<{
+      role: string;
+      content: string | Array<{
+        type: string;
+        text?: string;
+        input?: any;
+        content?: string | any;
+      }>;
+    }>;
+    system?: string | Array<{
+      type: string;
+      text?: string | string[];
+    }>;
+    tools?: Array<{
+      name: string;
+      description?: string;
+      input_schema: object;
+    }>;
+  }
+
+  export interface TokenizerResult {
+    tokenCount: number;
+    tokenizerUsed: string;
+    cached: boolean;
+  }
+
+  export class TokenizerService {
+    countTokens(request: TokenizeRequest, config?: TokenizerConfig): Promise<TokenizerResult>;
+    getTokenizerConfigForModel(providerName: string, modelName: string): TokenizerConfig | undefined;
+    clearCache(): void;
+    dispose(): void;
+  }
 }
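
To make the declarations concrete, here is a sketch of a TokenizerConfig a provider entry might carry. Every field value below (endpoint URL, header, response field) is invented for illustration; only the shape comes from the interface above:

import type { TokenizerConfig } from "@musistudio/llms";

// Illustrative config: count tokens via a remote API, falling back to tiktoken.
const exampleConfig: TokenizerConfig = {
  type: "api",                                  // one of 'tiktoken' | 'huggingface' | 'api'
  url: "https://tokenizer.example.com/count",   // invented endpoint
  requestFormat: "anthropic",                   // a declared ApiRequestFormat value
  responseField: "input_tokens",                // where the count lives in the API response
  headers: { Authorization: "Bearer <key>" },   // invented header
  fallback: "tiktoken",                         // used if the API tokenizer fails
};

Presumably a 'huggingface' entry would instead set a model id and a plain 'tiktoken' entry an encoding name; getTokenizerConfigForModel returns whichever config matches the provider/model pair, or undefined to signal the tiktoken default.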