|
|
@@ -550,7 +550,7 @@ export class LlamaCpp implements LLM {
|
|
|
/**
|
|
|
* Initialize the llama instance (lazy)
|
|
|
*/
|
|
|
- private async ensureLlama(): Promise<Llama> {
|
|
|
+ private async ensureLlama(allowBuild = true): Promise<Llama> {
|
|
|
if (!this.llama) {
|
|
|
// Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
|
|
|
const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
|
|
|
@@ -558,9 +558,10 @@ export class LlamaCpp implements LLM {
|
|
|
|
|
|
const loadLlama = async (gpu: "auto" | false) =>
|
|
|
await getLlama({
|
|
|
- build: "autoAttempt",
|
|
|
+ build: allowBuild ? "autoAttempt" : "never",
|
|
|
logLevel: LlamaLogLevel.error,
|
|
|
gpu,
|
|
|
+ skipDownload: !allowBuild,
|
|
|
});
|
|
|
|
|
|
let llama: Llama;
|
|
|
@@ -1244,14 +1245,14 @@ export class LlamaCpp implements LLM {
|
|
|
* Get device/GPU info for status display.
|
|
|
* Initializes llama if not already done.
|
|
|
*/
|
|
|
- async getDeviceInfo(): Promise<{
|
|
|
+ async getDeviceInfo(options: { allowBuild?: boolean } = {}): Promise<{
|
|
|
gpu: string | false;
|
|
|
gpuOffloading: boolean;
|
|
|
gpuDevices: string[];
|
|
|
vram?: { total: number; used: number; free: number };
|
|
|
cpuCores: number;
|
|
|
}> {
|
|
|
- const llama = await this.ensureLlama();
|
|
|
+ const llama = await this.ensureLlama(options.allowBuild ?? true);
|
|
|
const gpuDevices = await llama.getGpuDeviceNames();
|
|
|
let vram: { total: number; used: number; free: number } | undefined;
|
|
|
if (llama.gpu) {
|