From 4eb3c85c68d2982f4f3f6e75197ee8efa8e7056c Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Mon, 16 Sep 2024 22:34:45 +0200 Subject: [PATCH 01/14] feat(hub): support scan cache --- packages/hub/src/lib/cache-management.spec.ts | 116 ++++++++ packages/hub/src/lib/cache-management.ts | 252 ++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100644 packages/hub/src/lib/cache-management.spec.ts create mode 100644 packages/hub/src/lib/cache-management.ts diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts new file mode 100644 index 000000000..5d406acb3 --- /dev/null +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -0,0 +1,116 @@ +import { describe, test, expect, vi, beforeEach } from 'vitest'; +import * as cacheManagement from './cache-management'; +import { stat, readdir, realpath, lstat } from 'node:fs/promises'; +import { Stats } from "node:fs"; + +// Mocks +vi.mock('node:fs/promises'); + +beforeEach(() => { + vi.resetAllMocks(); + vi.restoreAllMocks(); +}); + +describe('scan_cache_dir', () => { + test('should throw an error if cacheDir is not a directory', async () => { + vi.mocked(stat).mockResolvedValueOnce({ + isDirectory: () => false + } as Stats); + + await expect(cacheManagement.scan_cache_dir('/fake/dir')).rejects.toThrow('Scan cache expects a directory'); + }); + + test('should scan a valid cache directory', async () => { + vi.mocked(stat).mockResolvedValueOnce({ + isDirectory: () => true + } as Stats); + + vi.mocked(readdir).mockResolvedValueOnce(['repo1', 'repo2']); + + vi.mocked(stat).mockResolvedValueOnce({ isDirectory: () => true } as any); + vi.mocked(stat).mockResolvedValueOnce({ isDirectory: () => true } as any); + + vi.spyOn(cacheManagement, 'scan_cached_repo').mockResolvedValueOnce({ repo_id: 'repo1', size_on_disk: 100 } as any); + vi.spyOn(cacheManagement, 'scan_cached_repo').mockResolvedValueOnce({ repo_id: 'repo2', 
size_on_disk: 200 } as any); + + const result = await cacheManagement.scan_cache_dir('/fake/dir'); + + expect(result.size_on_disk).toBe(300); + expect(result.repos.size).toBe(2); + }); +}); + +describe('scan_cached_repo', () => { + test('should throw an error for invalid repo path', async () => { + await expect(() => { + return cacheManagement.scan_cached_repo('/fake/repo_path'); + }).rejects.toThrow('Repo path is not a valid HuggingFace cache directory'); + }); + + test('should return CachedRepoInfo for a valid repo', async () => { + const repoPath = '/fake/model--repo1'; + + vi.mocked(stat).mockResolvedValue({ isDirectory: () => true } as unknown as Stats); + vi.mocked(readdir).mockImplementationOnce(async () => { + return ['snapshot1', 'snapshot2'] as unknown as ReturnType + } + ); + vi.spyOn(cacheManagement, 'scanSnapshotDir').mockResolvedValueOnce(undefined); + + const result = await cacheManagement.scan_cached_repo(repoPath); + + expect(result.repo_id).toBe('repo1'); + expect(result.repo_type).toBe(cacheManagement.REPO_TYPE_T.MODEL); + }); +}); + + + +describe('scanSnapshotDir', () => { + test('should scan a valid snapshot directory', async () => { + const cachedFiles = new Set(); + const blobStats = new Map(); + vi.mocked(readdir).mockResolvedValueOnce([{ name: 'file1', isDirectory: () => false }]); + + vi.mocked(realpath).mockResolvedValueOnce('/fake/realpath'); + vi.mocked(lstat).mockResolvedValueOnce({ size: 1024, atimeMs: Date.now(), mtimeMs: Date.now() } as any); + + await cacheManagement.scanSnapshotDir('/fake/revision', cachedFiles, blobStats); + + expect(cachedFiles.size).toBe(1); + expect(blobStats.size).toBe(1); + }); +}); + +describe('getBlobStat', () => { + test('should retrieve blob stat if already cached', async () => { + const blobStats = new Map([['/fake/blob', { size: 1024 } as any]]); + const result = await cacheManagement.getBlobStat('/fake/blob', blobStats); + + expect(result.size).toBe(1024); + }); + + test('should fetch and cache blob 
stat if not cached', async () => { + const blobStats = new Map(); + vi.mocked(lstat).mockResolvedValueOnce({ size: 2048 } as any); + + const result = await cacheManagement.getBlobStat('/fake/blob', blobStats); + + expect(result.size).toBe(2048); + expect(blobStats.size).toBe(1); + }); +}); + +describe('parseRepoType', () => { + test('should parse model repo type', () => { + expect(cacheManagement.parseRepoType('model')).toBe(cacheManagement.REPO_TYPE_T.MODEL); + }); + + test('should parse dataset repo type', () => { + expect(cacheManagement.parseRepoType('dataset')).toBe(cacheManagement.REPO_TYPE_T.DATASET); + }); + + test('should throw an error for invalid repo type', () => { + expect(() => cacheManagement.parseRepoType('invalid')).toThrow(); + }); +}); diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts new file mode 100644 index 000000000..6fa7a1d4d --- /dev/null +++ b/packages/hub/src/lib/cache-management.ts @@ -0,0 +1,252 @@ +import { homedir } from "node:os"; +import { join, basename } from "node:path"; +import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises"; +import type { Stats } from "node:fs"; + +const default_home = join(homedir(), ".cache"); +export const HF_HOME: string = process.env["HF_HOME"] ?? ( + join(process.env["XDG_CACHE_HOME"] ?? default_home, "huggingface") +); + +const default_cache_path = join(HF_HOME, "hub"); + +// Legacy env variable +export const HUGGINGFACE_HUB_CACHE = process.env['HUGGINGFACE_HUB_CACHE'] ?? default_cache_path; +// New env variable +export const HF_HUB_CACHE = process.env["HF_HUB_CACHE"] ?? 
HUGGINGFACE_HUB_CACHE; + +const FILES_TO_IGNORE: string[] = [".DS_Store"] + +export enum REPO_TYPE_T { + MODEL = "model", + DATASET = "dataset", + SPACE = "space", +} + +export interface CachedFileInfo { + file_name: string + file_path: string + blob_path: string + size_on_disk: number + + blob_last_accessed: number + blob_last_modified: number +} + +export interface CachedRevisionInfo { + commit_hash: string + snapshot_path: string + size_on_disk: number + readonly files: Set + readonly refs: Set + + last_modified: number +} + +export interface CachedRepoInfo { + repo_id: string + repo_type: REPO_TYPE_T + repo_path: string + size_on_disk: number + nb_files: number + readonly revisions: Set + + last_accessed: number + last_modified: number +} + +export interface HFCacheInfo { + size_on_disk: number + readonly repos: Set + warnings: Error[] +} + +export async function scan_cache_dir(cacheDir: string | undefined = undefined): Promise { + if (!cacheDir) + cacheDir = HF_HUB_CACHE + + const s = await stat(cacheDir); + if(!s.isDirectory()) { + throw new Error("Scan cache expects a directory but found a file: {cache_dir}. 
Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.") + } + + const repos = new Set(); + const warnings: Error[] = []; + + const directories = await readdir(cacheDir); + for (const repo of directories) { + // skip .locks folder + if(repo === ".locks") continue; + + // get the absolute path of the repo + const absolute = join(cacheDir, repo); + + // ignore non-directory element + const s = await stat(absolute); + if(!s.isDirectory()) { + continue; + } + + try { + const cached = await scan_cached_repo(absolute); + repos.add(cached); + } catch (err: unknown) { + warnings.push(err as Error); + } + } + + return { + repos: repos, + size_on_disk: [...repos.values()].reduce((sum, repo) => sum + repo.size_on_disk, 0), + warnings: warnings, + }; +} + +export async function scan_cached_repo(repo_path: string): Promise { + // get the directory name + const name = basename(repo_path); + if(!name.includes('--')) { + throw new Error(`Repo path is not a valid HuggingFace cache directory: ${name}`); + } + + // parse the repoId from directory name + const [type, ...remaining] = name.split('--'); + const repoType = parseRepoType(type); + const repoId = remaining.join('/'); + + const snapshotsPath = join(repo_path, 'snapshots'); + const refsPath = join(repo_path, 'refs'); + + const snapshotStat = await stat(snapshotsPath); + if(!snapshotStat.isDirectory()) { + throw new Error(`Snapshots dir doesn't exist in cached repo ${snapshotsPath}`); + } + + // Check if the refs directory exists and scan it + const refsByHash: Map> = new Map(); + const refsStat = await stat(refsPath); + if (refsStat.isDirectory()) { + await scanRefsDir(refsPath, refsByHash); + } + + // Scan snapshots directory and collect cached revision information + const cachedRevisions: Set = new Set(); + const blobStats: Map = new Map(); // Store blob stats + + const snapshotDirs = await readdir(snapshotsPath); + for (const dir of snapshotDirs) { + if (FILES_TO_IGNORE.includes(dir)) continue; // 
Ignore unwanted files + + const revisionPath = join(snapshotsPath, dir); + const revisionStat = await stat(revisionPath); + if (!revisionStat.isDirectory()) { + throw new Error(`Snapshots folder corrupted. Found a file: ${revisionPath}`); + } + + const cachedFiles: Set = new Set(); + await scanSnapshotDir(revisionPath, cachedFiles, blobStats); + + const revisionLastModified = cachedFiles.size > 0 + ? Math.max(...[...cachedFiles].map(file => file.blob_last_modified)) + : revisionStat.mtimeMs; + + cachedRevisions.add({ + commit_hash: dir, + files: cachedFiles, + refs: refsByHash.get(dir) || new Set(), + size_on_disk: [...cachedFiles].reduce((sum, file) => sum + file.size_on_disk, 0), + snapshot_path: revisionPath, + last_modified: revisionLastModified + }); + + refsByHash.delete(dir); + } + + // Verify that all refs refer to a valid revision + // TODO: not sure what this is ????? + if (refsByHash.size > 0) { + throw new Error( + `Reference(s) refer to missing commit hashes: ${JSON.stringify(Object.fromEntries(refsByHash))} (${repo_path})` + ); + } + + const repoStats = await stat(repo_path); + const repoLastAccessed = blobStats.size > 0 + ? Math.max(...[...blobStats.values()].map(stat => stat.atimeMs)) + : repoStats.atimeMs; + + const repoLastModified = blobStats.size > 0 + ? 
Math.max(...[...blobStats.values()].map(stat => stat.mtimeMs)) + : repoStats.mtimeMs; + + // Return the constructed CachedRepoInfo object + return { + repo_id: repoId, + repo_type: repoType, + repo_path: repo_path, + nb_files: blobStats.size, + revisions: cachedRevisions, + size_on_disk: [...blobStats.values()].reduce((sum, stat) => sum + stat.size, 0), + last_accessed: repoLastAccessed, + last_modified: repoLastModified + }; +} + +export async function scanRefsDir(refsPath: string, refsByHash: Map>): Promise { + const refFiles = await readdir(refsPath, { withFileTypes: true }); + for (const refFile of refFiles) { + const refFilePath = join(refsPath, refFile.name); + if (refFile.isDirectory()) continue; // Skip directories + + const commitHash = await readFile(refFilePath, 'utf-8'); + const refName = refFile.name; + if (!refsByHash.has(commitHash)) { + refsByHash.set(commitHash, new Set()); + } + refsByHash.get(commitHash)?.add(refName); + } +} + +export async function scanSnapshotDir(revisionPath: string, cachedFiles: Set, blobStats: Map): Promise { + const files = await readdir(revisionPath, { withFileTypes: true }); + for (const file of files) { + if (file.isDirectory()) continue; // Skip directories + + const filePath = join(revisionPath, file.name); + const blobPath = await realpath(filePath); + const blobStat = await getBlobStat(blobPath, blobStats); + + cachedFiles.add({ + file_name: file.name, + file_path: filePath, + blob_path: blobPath, + size_on_disk: blobStat.size, + blob_last_accessed: blobStat.atimeMs, + blob_last_modified: blobStat.mtimeMs + }); + } +} + +export async function getBlobStat(blobPath: string, blobStats: Map): Promise { + const blob = blobStats.get(blobPath); + if (!blob) { + const statResult = await lstat(blobPath); + blobStats.set(blobPath, statResult); + return statResult; + } + return blob; +} + +export function parseRepoType(type: string): REPO_TYPE_T { + switch (type) { + case 'models': + case 'model': + return REPO_TYPE_T.MODEL; + 
case REPO_TYPE_T.DATASET: + return REPO_TYPE_T.DATASET; + case REPO_TYPE_T.SPACE: + return REPO_TYPE_T.SPACE; + default: + throw new Error('') + } +} \ No newline at end of file From 8e4e46e16d0dd07ac8fe7cdbebac3cde3ec87f64 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:09:29 +0200 Subject: [PATCH 02/14] fix: formatting and linter --- packages/hub/src/lib/cache-management.spec.ts | 134 ++++++++++-------- packages/hub/src/lib/cache-management.ts | 122 ++++++++-------- 2 files changed, 137 insertions(+), 119 deletions(-) diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index 5d406acb3..57f4ad353 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -1,116 +1,132 @@ -import { describe, test, expect, vi, beforeEach } from 'vitest'; -import * as cacheManagement from './cache-management'; -import { stat, readdir, realpath, lstat } from 'node:fs/promises'; -import { Stats } from "node:fs"; +import { describe, test, expect, vi, beforeEach } from "vitest"; +import { + scan_cache_dir, + scan_cached_repo, + REPO_TYPE_T, + scanSnapshotDir, + parseRepoType, + getBlobStat, + type CachedFileInfo, +} from "./cache-management"; +import { stat, readdir, realpath, lstat } from "node:fs/promises"; +import type { Dirent, Stats } from "node:fs"; +import { join } from "node:path"; // Mocks -vi.mock('node:fs/promises'); +vi.mock("node:fs/promises"); beforeEach(() => { vi.resetAllMocks(); vi.restoreAllMocks(); }); -describe('scan_cache_dir', () => { - test('should throw an error if cacheDir is not a directory', async () => { +describe("scan_cache_dir", () => { + test("should throw an error if cacheDir is not a directory", async () => { vi.mocked(stat).mockResolvedValueOnce({ - isDirectory: () => false + isDirectory: () => false, } as Stats); - await 
expect(cacheManagement.scan_cache_dir('/fake/dir')).rejects.toThrow('Scan cache expects a directory'); + await expect(scan_cache_dir("/fake/dir")).rejects.toThrow("Scan cache expects a directory"); }); - test('should scan a valid cache directory', async () => { + test("empty directory should return an empty set of repository and no warnings", async () => { vi.mocked(stat).mockResolvedValueOnce({ - isDirectory: () => true + isDirectory: () => true, } as Stats); - vi.mocked(readdir).mockResolvedValueOnce(['repo1', 'repo2']); + // mock empty cache folder + vi.mocked(readdir).mockResolvedValue([]); - vi.mocked(stat).mockResolvedValueOnce({ isDirectory: () => true } as any); - vi.mocked(stat).mockResolvedValueOnce({ isDirectory: () => true } as any); + const result = await scan_cache_dir("/fake/dir"); - vi.spyOn(cacheManagement, 'scan_cached_repo').mockResolvedValueOnce({ repo_id: 'repo1', size_on_disk: 100 } as any); - vi.spyOn(cacheManagement, 'scan_cached_repo').mockResolvedValueOnce({ repo_id: 'repo2', size_on_disk: 200 } as any); + // cacheDir must have been read + expect(readdir).toHaveBeenCalledWith("/fake/dir"); - const result = await cacheManagement.scan_cache_dir('/fake/dir'); - - expect(result.size_on_disk).toBe(300); - expect(result.repos.size).toBe(2); + expect(result.warnings.length).toBe(0); + expect(result.repos.size).toBe(0); + expect(result.size_on_disk).toBe(0); }); }); -describe('scan_cached_repo', () => { - test('should throw an error for invalid repo path', async () => { +describe("scan_cached_repo", () => { + test("should throw an error for invalid repo path", async () => { await expect(() => { - return cacheManagement.scan_cached_repo('/fake/repo_path'); - }).rejects.toThrow('Repo path is not a valid HuggingFace cache directory'); + return scan_cached_repo("/fake/repo_path"); + }).rejects.toThrow("Repo path is not a valid HuggingFace cache directory"); }); - test('should return CachedRepoInfo for a valid repo', async () => { - const repoPath = 
'/fake/model--repo1'; + test("should throw an error if the snapshot folder does not exist", async () => { + vi.mocked(readdir).mockResolvedValue([]); + vi.mocked(stat).mockResolvedValue({ + isDirectory: () => false, + } as Stats); - vi.mocked(stat).mockResolvedValue({ isDirectory: () => true } as unknown as Stats); - vi.mocked(readdir).mockImplementationOnce(async () => { - return ['snapshot1', 'snapshot2'] as unknown as ReturnType - } - ); - vi.spyOn(cacheManagement, 'scanSnapshotDir').mockResolvedValueOnce(undefined); + await expect(() => { + return scan_cached_repo("/fake/cacheDir/models--hello-world--name"); + }).rejects.toThrow("Snapshots dir doesn't exist in cached repo"); + }); - const result = await cacheManagement.scan_cached_repo(repoPath); + test("should properly parse the repository name", async () => { + const repoPath = "/fake/cacheDir/models--hello-world--name"; + vi.mocked(readdir).mockResolvedValue([]); + vi.mocked(stat).mockResolvedValue({ + isDirectory: () => true, + } as Stats); - expect(result.repo_id).toBe('repo1'); - expect(result.repo_type).toBe(cacheManagement.REPO_TYPE_T.MODEL); + const result = await scan_cached_repo(repoPath); + expect(readdir).toHaveBeenCalledWith(join(repoPath, "refs"), { + withFileTypes: true, + }); + + expect(result.repo_id).toBe("hello-world/name"); }); }); +describe("scanSnapshotDir", () => { + test("should scan a valid snapshot directory", async () => { + const cachedFiles = new Set(); + const blobStats = new Map(); + vi.mocked(readdir).mockResolvedValueOnce([{ name: "file1", isDirectory: () => false } as Dirent]); + vi.mocked(realpath).mockResolvedValueOnce("/fake/realpath"); + vi.mocked(lstat).mockResolvedValueOnce({ size: 1024, atimeMs: Date.now(), mtimeMs: Date.now() } as Stats); -describe('scanSnapshotDir', () => { - test('should scan a valid snapshot directory', async () => { - const cachedFiles = new Set(); - const blobStats = new Map(); - vi.mocked(readdir).mockResolvedValueOnce([{ name: 'file1', 
isDirectory: () => false }]); - - vi.mocked(realpath).mockResolvedValueOnce('/fake/realpath'); - vi.mocked(lstat).mockResolvedValueOnce({ size: 1024, atimeMs: Date.now(), mtimeMs: Date.now() } as any); - - await cacheManagement.scanSnapshotDir('/fake/revision', cachedFiles, blobStats); + await scanSnapshotDir("/fake/revision", cachedFiles, blobStats); expect(cachedFiles.size).toBe(1); expect(blobStats.size).toBe(1); }); }); -describe('getBlobStat', () => { - test('should retrieve blob stat if already cached', async () => { - const blobStats = new Map([['/fake/blob', { size: 1024 } as any]]); - const result = await cacheManagement.getBlobStat('/fake/blob', blobStats); +describe("getBlobStat", () => { + test("should retrieve blob stat if already cached", async () => { + const blobStats = new Map([["/fake/blob", { size: 1024 } as Stats]]); + const result = await getBlobStat("/fake/blob", blobStats); expect(result.size).toBe(1024); }); - test('should fetch and cache blob stat if not cached', async () => { + test("should fetch and cache blob stat if not cached", async () => { const blobStats = new Map(); - vi.mocked(lstat).mockResolvedValueOnce({ size: 2048 } as any); + vi.mocked(lstat).mockResolvedValueOnce({ size: 2048 } as Stats); - const result = await cacheManagement.getBlobStat('/fake/blob', blobStats); + const result = await getBlobStat("/fake/blob", blobStats); expect(result.size).toBe(2048); expect(blobStats.size).toBe(1); }); }); -describe('parseRepoType', () => { - test('should parse model repo type', () => { - expect(cacheManagement.parseRepoType('model')).toBe(cacheManagement.REPO_TYPE_T.MODEL); +describe("parseRepoType", () => { + test("should parse model repo type", () => { + expect(parseRepoType("model")).toBe(REPO_TYPE_T.MODEL); }); - test('should parse dataset repo type', () => { - expect(cacheManagement.parseRepoType('dataset')).toBe(cacheManagement.REPO_TYPE_T.DATASET); + test("should parse dataset repo type", () => { + 
expect(parseRepoType("dataset")).toBe(REPO_TYPE_T.DATASET); }); - test('should throw an error for invalid repo type', () => { - expect(() => cacheManagement.parseRepoType('invalid')).toThrow(); + test("should throw an error for invalid repo type", () => { + expect(() => parseRepoType("invalid")).toThrow(); }); }); diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 6fa7a1d4d..bbf733beb 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -4,18 +4,17 @@ import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises"; import type { Stats } from "node:fs"; const default_home = join(homedir(), ".cache"); -export const HF_HOME: string = process.env["HF_HOME"] ?? ( - join(process.env["XDG_CACHE_HOME"] ?? default_home, "huggingface") -); +export const HF_HOME: string = + process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? default_home, "huggingface"); const default_cache_path = join(HF_HOME, "hub"); // Legacy env variable -export const HUGGINGFACE_HUB_CACHE = process.env['HUGGINGFACE_HUB_CACHE'] ?? default_cache_path; +export const HUGGINGFACE_HUB_CACHE = process.env["HUGGINGFACE_HUB_CACHE"] ?? default_cache_path; // New env variable export const HF_HUB_CACHE = process.env["HF_HUB_CACHE"] ?? 
HUGGINGFACE_HUB_CACHE; -const FILES_TO_IGNORE: string[] = [".DS_Store"] +const FILES_TO_IGNORE: string[] = [".DS_Store"]; export enum REPO_TYPE_T { MODEL = "model", @@ -24,50 +23,51 @@ export enum REPO_TYPE_T { } export interface CachedFileInfo { - file_name: string - file_path: string - blob_path: string - size_on_disk: number + file_name: string; + file_path: string; + blob_path: string; + size_on_disk: number; - blob_last_accessed: number - blob_last_modified: number + blob_last_accessed: number; + blob_last_modified: number; } export interface CachedRevisionInfo { - commit_hash: string - snapshot_path: string - size_on_disk: number - readonly files: Set - readonly refs: Set + commit_hash: string; + snapshot_path: string; + size_on_disk: number; + readonly files: Set; + readonly refs: Set; - last_modified: number + last_modified: number; } export interface CachedRepoInfo { - repo_id: string - repo_type: REPO_TYPE_T - repo_path: string - size_on_disk: number - nb_files: number - readonly revisions: Set - - last_accessed: number - last_modified: number + repo_id: string; + repo_type: REPO_TYPE_T; + repo_path: string; + size_on_disk: number; + nb_files: number; + readonly revisions: Set; + + last_accessed: number; + last_modified: number; } export interface HFCacheInfo { - size_on_disk: number - readonly repos: Set - warnings: Error[] + size_on_disk: number; + readonly repos: Set; + warnings: Error[]; } export async function scan_cache_dir(cacheDir: string | undefined = undefined): Promise { - if (!cacheDir) - cacheDir = HF_HUB_CACHE + if (!cacheDir) cacheDir = HF_HUB_CACHE; const s = await stat(cacheDir); - if(!s.isDirectory()) { - throw new Error("Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.") + if (!s.isDirectory()) { + throw new Error( + `Scan cache expects a directory but found a file: ${cacheDir}. 
Please use \`cacheDir\` argument or set \`HF_HUB_CACHE\` environment variable.` + ); } const repos = new Set(); @@ -76,14 +76,14 @@ export async function scan_cache_dir(cacheDir: string | undefined = undefined): const directories = await readdir(cacheDir); for (const repo of directories) { // skip .locks folder - if(repo === ".locks") continue; + if (repo === ".locks") continue; // get the absolute path of the repo const absolute = join(cacheDir, repo); // ignore non-directory element const s = await stat(absolute); - if(!s.isDirectory()) { + if (!s.isDirectory()) { continue; } @@ -105,20 +105,20 @@ export async function scan_cache_dir(cacheDir: string | undefined = undefined): export async function scan_cached_repo(repo_path: string): Promise { // get the directory name const name = basename(repo_path); - if(!name.includes('--')) { + if (!name.includes("--")) { throw new Error(`Repo path is not a valid HuggingFace cache directory: ${name}`); } // parse the repoId from directory name - const [type, ...remaining] = name.split('--'); + const [type, ...remaining] = name.split("--"); const repoType = parseRepoType(type); - const repoId = remaining.join('/'); + const repoId = remaining.join("/"); - const snapshotsPath = join(repo_path, 'snapshots'); - const refsPath = join(repo_path, 'refs'); + const snapshotsPath = join(repo_path, "snapshots"); + const refsPath = join(repo_path, "refs"); const snapshotStat = await stat(snapshotsPath); - if(!snapshotStat.isDirectory()) { + if (!snapshotStat.isDirectory()) { throw new Error(`Snapshots dir doesn't exist in cached repo ${snapshotsPath}`); } @@ -146,9 +146,10 @@ export async function scan_cached_repo(repo_path: string): Promise = new Set(); await scanSnapshotDir(revisionPath, cachedFiles, blobStats); - const revisionLastModified = cachedFiles.size > 0 - ? Math.max(...[...cachedFiles].map(file => file.blob_last_modified)) - : revisionStat.mtimeMs; + const revisionLastModified = + cachedFiles.size > 0 + ? 
Math.max(...[...cachedFiles].map((file) => file.blob_last_modified)) + : revisionStat.mtimeMs; cachedRevisions.add({ commit_hash: dir, @@ -156,14 +157,13 @@ export async function scan_cached_repo(repo_path: string): Promise sum + file.size_on_disk, 0), snapshot_path: revisionPath, - last_modified: revisionLastModified + last_modified: revisionLastModified, }); refsByHash.delete(dir); } // Verify that all refs refer to a valid revision - // TODO: not sure what this is ????? if (refsByHash.size > 0) { throw new Error( `Reference(s) refer to missing commit hashes: ${JSON.stringify(Object.fromEntries(refsByHash))} (${repo_path})` @@ -171,13 +171,11 @@ export async function scan_cached_repo(repo_path: string): Promise 0 - ? Math.max(...[...blobStats.values()].map(stat => stat.atimeMs)) - : repoStats.atimeMs; + const repoLastAccessed = + blobStats.size > 0 ? Math.max(...[...blobStats.values()].map((stat) => stat.atimeMs)) : repoStats.atimeMs; - const repoLastModified = blobStats.size > 0 - ? Math.max(...[...blobStats.values()].map(stat => stat.mtimeMs)) - : repoStats.mtimeMs; + const repoLastModified = + blobStats.size > 0 ? 
Math.max(...[...blobStats.values()].map((stat) => stat.mtimeMs)) : repoStats.mtimeMs; // Return the constructed CachedRepoInfo object return { @@ -188,7 +186,7 @@ export async function scan_cached_repo(repo_path: string): Promise sum + stat.size, 0), last_accessed: repoLastAccessed, - last_modified: repoLastModified + last_modified: repoLastModified, }; } @@ -198,7 +196,7 @@ export async function scanRefsDir(refsPath: string, refsByHash: Map, blobStats: Map): Promise { +export async function scanSnapshotDir( + revisionPath: string, + cachedFiles: Set, + blobStats: Map +): Promise { const files = await readdir(revisionPath, { withFileTypes: true }); for (const file of files) { if (file.isDirectory()) continue; // Skip directories @@ -222,7 +224,7 @@ export async function scanSnapshotDir(revisionPath: string, cachedFiles: Set Date: Tue, 17 Sep 2024 18:12:46 +0200 Subject: [PATCH 03/14] fix: tests --- packages/hub/src/lib/cache-management.spec.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index 57f4ad353..753ebbf87 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -103,6 +103,7 @@ describe("getBlobStat", () => { const blobStats = new Map([["/fake/blob", { size: 1024 } as Stats]]); const result = await getBlobStat("/fake/blob", blobStats); + expect(lstat).not.toHaveBeenCalled(); expect(result.size).toBe(1024); }); @@ -118,6 +119,10 @@ describe("getBlobStat", () => { }); describe("parseRepoType", () => { + test("should parse models repo type", () => { + expect(parseRepoType("models")).toBe(REPO_TYPE_T.MODEL); + }); + test("should parse model repo type", () => { expect(parseRepoType("model")).toBe(REPO_TYPE_T.MODEL); }); From 629e8c5d7ee49d37739b845add958628674aef75 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:18:03 +0200 Subject: [PATCH 
04/14] fix: naming convention --- packages/hub/src/lib/cache-management.spec.ts | 22 ++--- packages/hub/src/lib/cache-management.ts | 92 +++++++++---------- 2 files changed, 56 insertions(+), 58 deletions(-) diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index 753ebbf87..a8fef9cf2 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -1,7 +1,7 @@ import { describe, test, expect, vi, beforeEach } from "vitest"; import { - scan_cache_dir, - scan_cached_repo, + scanCacheDir, + scanCachedRepo, REPO_TYPE_T, scanSnapshotDir, parseRepoType, @@ -20,13 +20,13 @@ beforeEach(() => { vi.restoreAllMocks(); }); -describe("scan_cache_dir", () => { +describe("scanCacheDir", () => { test("should throw an error if cacheDir is not a directory", async () => { vi.mocked(stat).mockResolvedValueOnce({ isDirectory: () => false, } as Stats); - await expect(scan_cache_dir("/fake/dir")).rejects.toThrow("Scan cache expects a directory"); + await expect(scanCacheDir("/fake/dir")).rejects.toThrow("Scan cache expects a directory"); }); test("empty directory should return an empty set of repository and no warnings", async () => { @@ -37,21 +37,21 @@ describe("scan_cache_dir", () => { // mock empty cache folder vi.mocked(readdir).mockResolvedValue([]); - const result = await scan_cache_dir("/fake/dir"); + const result = await scanCacheDir("/fake/dir"); // cacheDir must have been read expect(readdir).toHaveBeenCalledWith("/fake/dir"); expect(result.warnings.length).toBe(0); expect(result.repos.size).toBe(0); - expect(result.size_on_disk).toBe(0); + expect(result.sizeOnDisk).toBe(0); }); }); -describe("scan_cached_repo", () => { +describe("scanCachedRepo", () => { test("should throw an error for invalid repo path", async () => { await expect(() => { - return scan_cached_repo("/fake/repo_path"); + return scanCachedRepo("/fake/repo_path"); }).rejects.toThrow("Repo path is not a valid 
HuggingFace cache directory"); }); @@ -62,7 +62,7 @@ describe("scan_cached_repo", () => { } as Stats); await expect(() => { - return scan_cached_repo("/fake/cacheDir/models--hello-world--name"); + return scanCachedRepo("/fake/cacheDir/models--hello-world--name"); }).rejects.toThrow("Snapshots dir doesn't exist in cached repo"); }); @@ -73,12 +73,12 @@ describe("scan_cached_repo", () => { isDirectory: () => true, } as Stats); - const result = await scan_cached_repo(repoPath); + const result = await scanCachedRepo(repoPath); expect(readdir).toHaveBeenCalledWith(join(repoPath, "refs"), { withFileTypes: true, }); - expect(result.repo_id).toBe("hello-world/name"); + expect(result.repoId).toBe("hello-world/name"); }); }); diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index bbf733beb..94cb758d4 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -23,44 +23,44 @@ export enum REPO_TYPE_T { } export interface CachedFileInfo { - file_name: string; - file_path: string; - blob_path: string; - size_on_disk: number; + filename: string; + filePath: string; + blobPath: string; + sizeOnDisk: number; - blob_last_accessed: number; - blob_last_modified: number; + blobLastAccessed: number; + blobLastModified: number; } export interface CachedRevisionInfo { - commit_hash: string; - snapshot_path: string; - size_on_disk: number; + commitHash: string; + snapshotPath: string; + sizeOnDisk: number; readonly files: Set; readonly refs: Set; - last_modified: number; + lastModified: number; } export interface CachedRepoInfo { - repo_id: string; - repo_type: REPO_TYPE_T; - repo_path: string; - size_on_disk: number; - nb_files: number; + repoId: string; + repoType: REPO_TYPE_T; + repoPath: string; + sizeOnDisk: number; + nbFiles: number; readonly revisions: Set; - last_accessed: number; - last_modified: number; + lastAccessed: number; + lastModified: number; } export interface HFCacheInfo { - 
size_on_disk: number; + sizeOnDisk: number; readonly repos: Set; warnings: Error[]; } -export async function scan_cache_dir(cacheDir: string | undefined = undefined): Promise { +export async function scanCacheDir(cacheDir: string | undefined = undefined): Promise { if (!cacheDir) cacheDir = HF_HUB_CACHE; const s = await stat(cacheDir); @@ -88,7 +88,7 @@ export async function scan_cache_dir(cacheDir: string | undefined = undefined): } try { - const cached = await scan_cached_repo(absolute); + const cached = await scanCachedRepo(absolute); repos.add(cached); } catch (err: unknown) { warnings.push(err as Error); @@ -97,14 +97,14 @@ export async function scan_cache_dir(cacheDir: string | undefined = undefined): return { repos: repos, - size_on_disk: [...repos.values()].reduce((sum, repo) => sum + repo.size_on_disk, 0), + sizeOnDisk: [...repos.values()].reduce((sum, repo) => sum + repo.sizeOnDisk, 0), warnings: warnings, }; } -export async function scan_cached_repo(repo_path: string): Promise { +export async function scanCachedRepo(repoPath: string): Promise { // get the directory name - const name = basename(repo_path); + const name = basename(repoPath); if (!name.includes("--")) { throw new Error(`Repo path is not a valid HuggingFace cache directory: ${name}`); } @@ -114,8 +114,8 @@ export async function scan_cached_repo(repo_path: string): Promise 0 - ? Math.max(...[...cachedFiles].map((file) => file.blob_last_modified)) - : revisionStat.mtimeMs; + cachedFiles.size > 0 ? 
Math.max(...[...cachedFiles].map((file) => file.blobLastModified)) : revisionStat.mtimeMs; cachedRevisions.add({ - commit_hash: dir, + commitHash: dir, files: cachedFiles, refs: refsByHash.get(dir) || new Set(), - size_on_disk: [...cachedFiles].reduce((sum, file) => sum + file.size_on_disk, 0), - snapshot_path: revisionPath, - last_modified: revisionLastModified, + sizeOnDisk: [...cachedFiles].reduce((sum, file) => sum + file.sizeOnDisk, 0), + snapshotPath: revisionPath, + lastModified: revisionLastModified, }); refsByHash.delete(dir); @@ -166,11 +164,11 @@ export async function scan_cached_repo(repo_path: string): Promise 0) { throw new Error( - `Reference(s) refer to missing commit hashes: ${JSON.stringify(Object.fromEntries(refsByHash))} (${repo_path})` + `Reference(s) refer to missing commit hashes: ${JSON.stringify(Object.fromEntries(refsByHash))} (${repoPath})` ); } - const repoStats = await stat(repo_path); + const repoStats = await stat(repoPath); const repoLastAccessed = blobStats.size > 0 ? 
Math.max(...[...blobStats.values()].map((stat) => stat.atimeMs)) : repoStats.atimeMs; @@ -179,14 +177,14 @@ export async function scan_cached_repo(repo_path: string): Promise sum + stat.size, 0), - last_accessed: repoLastAccessed, - last_modified: repoLastModified, + sizeOnDisk: [...blobStats.values()].reduce((sum, stat) => sum + stat.size, 0), + lastAccessed: repoLastAccessed, + lastModified: repoLastModified, }; } @@ -219,12 +217,12 @@ export async function scanSnapshotDir( const blobStat = await getBlobStat(blobPath, blobStats); cachedFiles.add({ - file_name: file.name, - file_path: filePath, - blob_path: blobPath, - size_on_disk: blobStat.size, - blob_last_accessed: blobStat.atimeMs, - blob_last_modified: blobStat.mtimeMs, + filename: file.name, + filePath: filePath, + blobPath: blobPath, + sizeOnDisk: blobStat.size, + blobLastAccessed: blobStat.atimeMs, + blobLastModified: blobStat.mtimeMs, }); } } From fa170b94803c98d8e3d721fc529016b07ee6be00 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:23:30 +0200 Subject: [PATCH 05/14] fix: adding cache-management to index.ts --- packages/hub/src/lib/index.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/hub/src/lib/index.ts b/packages/hub/src/lib/index.ts index 554977f02..4630d1764 100644 --- a/packages/hub/src/lib/index.ts +++ b/packages/hub/src/lib/index.ts @@ -1,3 +1,4 @@ +export * from "./cache-management"; export * from "./commit"; export * from "./count-commits"; export * from "./create-repo"; From fde3deeb2a736db13ac2fff66abbe39394493b88 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:18:40 +0200 Subject: [PATCH 06/14] fix(browser): exclude cache management --- packages/hub/package.json | 5 +++-- packages/hub/vitest-browser.config.mts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/hub/package.json b/packages/hub/package.json index 
73f8d3214..388ba648c 100644 --- a/packages/hub/package.json +++ b/packages/hub/package.json @@ -20,11 +20,12 @@ "browser": { "./src/utils/sha256-node.ts": false, "./src/utils/FileBlob.ts": false, + "./src/lib/cache-management.ts": false, "./dist/index.js": "./dist/browser/index.js", "./dist/index.mjs": "./dist/browser/index.mjs" }, "engines": { - "node": ">=18" + "node": ">=20" }, "source": "index.ts", "scripts": { @@ -56,7 +57,7 @@ "author": "Hugging Face", "license": "MIT", "devDependencies": { - "@types/node": "^20.11.28" + "@types/node": "^20.12.8" }, "dependencies": { "@huggingface/tasks": "workspace:^" diff --git a/packages/hub/vitest-browser.config.mts b/packages/hub/vitest-browser.config.mts index 65be77c7a..e106a2fba 100644 --- a/packages/hub/vitest-browser.config.mts +++ b/packages/hub/vitest-browser.config.mts @@ -2,6 +2,6 @@ import { configDefaults, defineConfig } from "vitest/config"; export default defineConfig({ test: { - exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts"], + exclude: [...configDefaults.exclude, "src/utils/FileBlob.spec.ts", "src/lib/cache-management.spec.ts"], }, }); From 7a5340797b0a5aefc15cefd109217ccd708112d8 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Wed, 18 Sep 2024 11:22:00 +0200 Subject: [PATCH 07/14] revert: useless changes --- packages/hub/package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/hub/package.json b/packages/hub/package.json index 388ba648c..c79f9d650 100644 --- a/packages/hub/package.json +++ b/packages/hub/package.json @@ -25,7 +25,7 @@ "./dist/index.mjs": "./dist/browser/index.mjs" }, "engines": { - "node": ">=20" + "node": ">=18" }, "source": "index.ts", "scripts": { @@ -57,7 +57,7 @@ "author": "Hugging Face", "license": "MIT", "devDependencies": { - "@types/node": "^20.12.8" + "@types/node": "^20.11.28" }, "dependencies": { "@huggingface/tasks": "workspace:^" From 77f219840b66597def0d2c4e1508a049278f4e34 Mon 
Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:24:16 +0200 Subject: [PATCH 08/14] fix: avoid homedir call on import --- packages/hub/src/lib/cache-management.ts | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 94cb758d4..ffe4ae399 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -3,16 +3,21 @@ import { join, basename } from "node:path"; import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises"; import type { Stats } from "node:fs"; -const default_home = join(homedir(), ".cache"); -export const HF_HOME: string = - process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? default_home, "huggingface"); +function getDefaultHome(): string { + return join(homedir(), ".cache"); +} -const default_cache_path = join(HF_HOME, "hub"); +function getDefaultCachePath(): string { + return process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? getDefaultHome(), "huggingface") +} -// Legacy env variable -export const HUGGINGFACE_HUB_CACHE = process.env["HUGGINGFACE_HUB_CACHE"] ?? default_cache_path; -// New env variable -export const HF_HUB_CACHE = process.env["HF_HUB_CACHE"] ?? HUGGINGFACE_HUB_CACHE; +function getHuggingFaceHubCache(): string { + return process.env["HUGGINGFACE_HUB_CACHE"] ?? getDefaultCachePath(); +} + +function getHFHubCache(): string { + return process.env["HF_HUB_CACHE"] ?? 
getHuggingFaceHubCache(); +} const FILES_TO_IGNORE: string[] = [".DS_Store"]; @@ -61,7 +66,7 @@ export interface HFCacheInfo { } export async function scanCacheDir(cacheDir: string | undefined = undefined): Promise { - if (!cacheDir) cacheDir = HF_HUB_CACHE; + if (!cacheDir) cacheDir = getHFHubCache(); const s = await stat(cacheDir); if (!s.isDirectory()) { From 53c81ff01ea72ae61333a8753a46a2b452459239 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Wed, 25 Sep 2024 14:42:50 +0200 Subject: [PATCH 09/14] fix: apply @coyotte508 code suggestion --- packages/hub/README.md | 12 ++ packages/hub/src/lib/cache-management.spec.ts | 21 ++-- packages/hub/src/lib/cache-management.ts | 115 +++++++++--------- 3 files changed, 84 insertions(+), 64 deletions(-) diff --git a/packages/hub/README.md b/packages/hub/README.md index 22a9db256..7744ee5e9 100644 --- a/packages/hub/README.md +++ b/packages/hub/README.md @@ -111,6 +111,18 @@ console.log(oauthResult); Checkout the demo: https://huggingface.co/spaces/huggingfacejs/client-side-oauth +## Hugging Face cache + +The `@huggingface/hub` package provides basic capabilities to scan the cache directory. Learn more about [Manage huggingface_hub cache-system](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache). + +```ts +import { scanCacheDir } from "@huggingface/hub"; + +const result = await scanCacheDir(); + +console.log(result); +``` + ## Performance considerations When uploading large files, you may want to run the `commit` calls inside a worker, to offload the sha256 computations. 
diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index a8fef9cf2..32203e688 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -2,7 +2,6 @@ import { describe, test, expect, vi, beforeEach } from "vitest"; import { scanCacheDir, scanCachedRepo, - REPO_TYPE_T, scanSnapshotDir, parseRepoType, getBlobStat, @@ -43,8 +42,8 @@ describe("scanCacheDir", () => { expect(readdir).toHaveBeenCalledWith("/fake/dir"); expect(result.warnings.length).toBe(0); - expect(result.repos.size).toBe(0); - expect(result.sizeOnDisk).toBe(0); + expect(result.repos).toHaveLength(0); + expect(result.size).toBe(0); }); }); @@ -84,7 +83,7 @@ describe("scanCachedRepo", () => { describe("scanSnapshotDir", () => { test("should scan a valid snapshot directory", async () => { - const cachedFiles = new Set(); + const cachedFiles: CachedFileInfo[] = []; const blobStats = new Map(); vi.mocked(readdir).mockResolvedValueOnce([{ name: "file1", isDirectory: () => false } as Dirent]); @@ -93,7 +92,7 @@ describe("scanSnapshotDir", () => { await scanSnapshotDir("/fake/revision", cachedFiles, blobStats); - expect(cachedFiles.size).toBe(1); + expect(cachedFiles).toHaveLength(1); expect(blobStats.size).toBe(1); }); }); @@ -120,18 +119,22 @@ describe("getBlobStat", () => { describe("parseRepoType", () => { test("should parse models repo type", () => { - expect(parseRepoType("models")).toBe(REPO_TYPE_T.MODEL); + expect(parseRepoType("models")).toBe("model"); }); test("should parse model repo type", () => { - expect(parseRepoType("model")).toBe(REPO_TYPE_T.MODEL); + expect(parseRepoType("model")).toBe("model"); }); test("should parse dataset repo type", () => { - expect(parseRepoType("dataset")).toBe(REPO_TYPE_T.DATASET); + expect(parseRepoType("dataset")).toBe("dataset"); + }); + + test("should parse space repo type", () => { + expect(parseRepoType("space")).toBe("space"); }); test("should throw an 
error for invalid repo type", () => { - expect(() => parseRepoType("invalid")).toThrow(); + expect(() => parseRepoType("invalid")).toThrowError("Invalid repo type: invalid"); }); }); diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index ffe4ae399..5b865a8f0 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -2,31 +2,26 @@ import { homedir } from "node:os"; import { join, basename } from "node:path"; import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises"; import type { Stats } from "node:fs"; +import type { RepoType } from "../types/public"; function getDefaultHome(): string { return join(homedir(), ".cache"); } function getDefaultCachePath(): string { - return process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? getDefaultHome(), "huggingface") + return join(process.env["HF_HOME"] ?? join(process.env["XDG_CACHE_HOME"] ?? getDefaultHome(), "huggingface"), "hub"); } function getHuggingFaceHubCache(): string { - return process.env["HUGGINGFACE_HUB_CACHE"] ?? getDefaultCachePath(); + return process.env["HUGGINGFACE_HUB_CACHE"] ?? getDefaultCachePath(); } function getHFHubCache(): string { - return process.env["HF_HUB_CACHE"] ?? getHuggingFaceHubCache(); + return process.env["HF_HUB_CACHE"] ?? 
getHuggingFaceHubCache(); } const FILES_TO_IGNORE: string[] = [".DS_Store"]; -export enum REPO_TYPE_T { - MODEL = "model", - DATASET = "dataset", - SPACE = "space", -} - export interface CachedFileInfo { filename: string; filePath: string; @@ -38,30 +33,30 @@ export interface CachedFileInfo { } export interface CachedRevisionInfo { - commitHash: string; - snapshotPath: string; - sizeOnDisk: number; - readonly files: Set; - readonly refs: Set; + commitOid: string; + path: string; + size: number; + files: CachedFileInfo[]; + refs: string[]; - lastModified: number; + lastModifiedAt: Date; } export interface CachedRepoInfo { repoId: string; - repoType: REPO_TYPE_T; - repoPath: string; - sizeOnDisk: number; - nbFiles: number; - readonly revisions: Set; - - lastAccessed: number; - lastModified: number; + repoType: RepoType; + path: string; + size: number; + filesCount: number; + revisions: CachedRevisionInfo[]; + + lastAccessedAt: Date; + lastModifiedAt: Date; } export interface HFCacheInfo { - sizeOnDisk: number; - readonly repos: Set; + size: number; + repos: CachedRepoInfo[]; warnings: Error[]; } @@ -75,7 +70,7 @@ export async function scanCacheDir(cacheDir: string | undefined = undefined): Pr ); } - const repos = new Set(); + const repos: CachedRepoInfo[] = []; const warnings: Error[] = []; const directories = await readdir(cacheDir); @@ -94,7 +89,7 @@ export async function scanCacheDir(cacheDir: string | undefined = undefined): Pr try { const cached = await scanCachedRepo(absolute); - repos.add(cached); + repos.push(cached); } catch (err: unknown) { warnings.push(err as Error); } @@ -102,7 +97,7 @@ export async function scanCacheDir(cacheDir: string | undefined = undefined): Pr return { repos: repos, - sizeOnDisk: [...repos.values()].reduce((sum, repo) => sum + repo.sizeOnDisk, 0), + size: [...repos.values()].reduce((sum, repo) => sum + repo.size, 0), warnings: warnings, }; } @@ -128,14 +123,14 @@ export async function scanCachedRepo(repoPath: string): Promise } // 
Check if the refs directory exists and scan it - const refsByHash: Map> = new Map(); + const refsByHash: Map = new Map(); const refsStat = await stat(refsPath); if (refsStat.isDirectory()) { await scanRefsDir(refsPath, refsByHash); } // Scan snapshots directory and collect cached revision information - const cachedRevisions: Set = new Set(); + const cachedRevisions: CachedRevisionInfo[] = []; const blobStats: Map = new Map(); // Store blob stats const snapshotDirs = await readdir(snapshotsPath); @@ -148,19 +143,21 @@ export async function scanCachedRepo(repoPath: string): Promise throw new Error(`Snapshots folder corrupted. Found a file: ${revisionPath}`); } - const cachedFiles: Set = new Set(); + const cachedFiles: CachedFileInfo[] = []; await scanSnapshotDir(revisionPath, cachedFiles, blobStats); const revisionLastModified = - cachedFiles.size > 0 ? Math.max(...[...cachedFiles].map((file) => file.blobLastModified)) : revisionStat.mtimeMs; + cachedFiles.length > 0 + ? Math.max(...[...cachedFiles].map((file) => file.blobLastModified)) + : revisionStat.mtimeMs; - cachedRevisions.add({ - commitHash: dir, + cachedRevisions.push({ + commitOid: dir, files: cachedFiles, - refs: refsByHash.get(dir) || new Set(), - sizeOnDisk: [...cachedFiles].reduce((sum, file) => sum + file.sizeOnDisk, 0), - snapshotPath: revisionPath, - lastModified: revisionLastModified, + refs: refsByHash.get(dir) || [], + size: [...cachedFiles].reduce((sum, file) => sum + file.sizeOnDisk, 0), + path: revisionPath, + lastModifiedAt: new Date(revisionLastModified), }); refsByHash.delete(dir); @@ -184,16 +181,16 @@ export async function scanCachedRepo(repoPath: string): Promise return { repoId: repoId, repoType: repoType, - repoPath: repoPath, - nbFiles: blobStats.size, + path: repoPath, + filesCount: blobStats.size, revisions: cachedRevisions, - sizeOnDisk: [...blobStats.values()].reduce((sum, stat) => sum + stat.size, 0), - lastAccessed: repoLastAccessed, - lastModified: repoLastModified, + size: 
[...blobStats.values()].reduce((sum, stat) => sum + stat.size, 0), + lastAccessedAt: new Date(repoLastAccessed), + lastModifiedAt: new Date(repoLastModified), }; } -export async function scanRefsDir(refsPath: string, refsByHash: Map>): Promise { +export async function scanRefsDir(refsPath: string, refsByHash: Map): Promise { const refFiles = await readdir(refsPath, { withFileTypes: true }); for (const refFile of refFiles) { const refFilePath = join(refsPath, refFile.name); @@ -202,15 +199,15 @@ export async function scanRefsDir(refsPath: string, refsByHash: Map, + cachedFiles: CachedFileInfo[], blobStats: Map ): Promise { const files = await readdir(revisionPath, { withFileTypes: true }); @@ -221,7 +218,7 @@ export async function scanSnapshotDir( const blobPath = await realpath(filePath); const blobStat = await getBlobStat(blobPath, blobStats); - cachedFiles.add({ + cachedFiles.push({ filename: file.name, filePath: filePath, blobPath: blobPath, @@ -242,16 +239,24 @@ export async function getBlobStat(blobPath: string, blobStats: Map { + console.log(result); + }) + .catch((err: unknown) => { + console.error(err); + }); From 0f33684a1992d51210bac60dcff5b9a26dbc7ee8 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Wed, 25 Sep 2024 14:46:21 +0200 Subject: [PATCH 10/14] fix: blob property type --- packages/hub/src/lib/cache-management.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 5b865a8f0..726c11471 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -28,8 +28,8 @@ export interface CachedFileInfo { blobPath: string; sizeOnDisk: number; - blobLastAccessed: number; - blobLastModified: number; + blobLastAccessedAt: Date; + blobLastModifiedAt: Date; } export interface CachedRevisionInfo { @@ -148,7 +148,7 @@ export async function scanCachedRepo(repoPath: 
string): Promise const revisionLastModified = cachedFiles.length > 0 - ? Math.max(...[...cachedFiles].map((file) => file.blobLastModified)) + ? Math.max(...[...cachedFiles].map((file) => file.blobLastModifiedAt.getTime())) : revisionStat.mtimeMs; cachedRevisions.push({ @@ -223,8 +223,8 @@ export async function scanSnapshotDir( filePath: filePath, blobPath: blobPath, sizeOnDisk: blobStat.size, - blobLastAccessed: blobStat.atimeMs, - blobLastModified: blobStat.mtimeMs, + blobLastAccessedAt: new Date(blobStat.atimeMs), + blobLastModifiedAt: new Date(blobStat.mtimeMs), }); } } From 25b4c8a1e1c313e78f3a6498214c452375466b04 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:40:15 +0200 Subject: [PATCH 11/14] fix: missing @coyotte508 comment --- packages/hub/src/lib/cache-management.spec.ts | 3 ++- packages/hub/src/lib/cache-management.ts | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index 32203e688..3655bab1a 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -77,7 +77,8 @@ describe("scanCachedRepo", () => { withFileTypes: true, }); - expect(result.repoId).toBe("hello-world/name"); + expect(result.id.name).toBe("hello-world/name"); + expect(result.id.type).toBe("model"); }); }); diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 726c11471..5c3cc5e11 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -2,7 +2,7 @@ import { homedir } from "node:os"; import { join, basename } from "node:path"; import { stat, readdir, readFile, realpath, lstat } from "node:fs/promises"; import type { Stats } from "node:fs"; -import type { RepoType } from "../types/public"; +import type { RepoType, RepoId } from "../types/public"; function 
getDefaultHome(): string { return join(homedir(), ".cache"); @@ -43,8 +43,7 @@ export interface CachedRevisionInfo { } export interface CachedRepoInfo { - repoId: string; - repoType: RepoType; + id: RepoId path: string; size: number; filesCount: number; @@ -179,8 +178,10 @@ export async function scanCachedRepo(repoPath: string): Promise // Return the constructed CachedRepoInfo object return { - repoId: repoId, - repoType: repoType, + id: { + name: repoId, + type: repoType, + }, path: repoPath, filesCount: blobStats.size, revisions: cachedRevisions, From f264a8172ec01c6b73bd9bf45f51dbda57aa1f95 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:55:58 +0200 Subject: [PATCH 12/14] fix: refactor CachedFileInfo --- packages/hub/src/lib/cache-management.ts | 44 +++++++++++------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 5c3cc5e11..51e7596ae 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -23,13 +23,16 @@ function getHFHubCache(): string { const FILES_TO_IGNORE: string[] = [".DS_Store"]; export interface CachedFileInfo { - filename: string; - filePath: string; - blobPath: string; - sizeOnDisk: number; - - blobLastAccessedAt: Date; - blobLastModifiedAt: Date; + path: string; + /** + * Underlying file - which `path` is symlinked to + */ + blob: { + size: number; + path: string; + lastModifiedAt: Date; + lastAccessedAt: Date; + }; } export interface CachedRevisionInfo { @@ -43,7 +46,7 @@ export interface CachedRevisionInfo { } export interface CachedRepoInfo { - id: RepoId + id: RepoId; path: string; size: number; filesCount: number; @@ -147,14 +150,14 @@ export async function scanCachedRepo(repoPath: string): Promise const revisionLastModified = cachedFiles.length > 0 - ? 
Math.max(...[...cachedFiles].map((file) => file.blobLastModifiedAt.getTime())) + ? Math.max(...[...cachedFiles].map((file) => file.blob.lastModifiedAt.getTime())) : revisionStat.mtimeMs; cachedRevisions.push({ commitOid: dir, files: cachedFiles, refs: refsByHash.get(dir) || [], - size: [...cachedFiles].reduce((sum, file) => sum + file.sizeOnDisk, 0), + size: [...cachedFiles].reduce((sum, file) => sum + file.blob.size, 0), path: revisionPath, lastModifiedAt: new Date(revisionLastModified), }); @@ -220,12 +223,13 @@ export async function scanSnapshotDir( const blobStat = await getBlobStat(blobPath, blobStats); cachedFiles.push({ - filename: file.name, - filePath: filePath, - blobPath: blobPath, - sizeOnDisk: blobStat.size, - blobLastAccessedAt: new Date(blobStat.atimeMs), - blobLastModifiedAt: new Date(blobStat.mtimeMs), + path: filePath, + blob: { + path: blobPath, + size: blobStat.size, + lastAccessedAt: new Date(blobStat.atimeMs), + lastModifiedAt: new Date(blobStat.mtimeMs), + }, }); } } @@ -253,11 +257,3 @@ export function parseRepoType(type: string): RepoType { throw new TypeError(`Invalid repo type: ${type}`); } } - -scanCacheDir() - .then((result) => { - console.log(result); - }) - .catch((err: unknown) => { - console.error(err); - }); From 76a8ec7c0b3980fca23de2815f4b60a4b339fda6 Mon Sep 17 00:00:00 2001 From: axel7083 <42176370+axel7083@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:14:46 +0200 Subject: [PATCH 13/14] Apply suggestions from @Wauplin Co-authored-by: Lucain --- packages/hub/README.md | 1 + packages/hub/src/lib/cache-management.ts | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/hub/README.md b/packages/hub/README.md index 7744ee5e9..a4a570bfe 100644 --- a/packages/hub/README.md +++ b/packages/hub/README.md @@ -122,6 +122,7 @@ const result = await scanCacheDir(); console.log(result); ``` +Note that the cache directory is created and used only by the Python and Rust libraries. 
Downloading files using the `@huggingface/hub` package won't use the cache directory. ## Performance considerations diff --git a/packages/hub/src/lib/cache-management.ts b/packages/hub/src/lib/cache-management.ts index 51e7596ae..aecbf271e 100644 --- a/packages/hub/src/lib/cache-management.ts +++ b/packages/hub/src/lib/cache-management.ts @@ -247,11 +247,10 @@ export async function getBlobStat(blobPath: string, blobStats: Map Date: Thu, 26 Sep 2024 15:16:26 +0200 Subject: [PATCH 14/14] fix: unit tests --- packages/hub/src/lib/cache-management.spec.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/hub/src/lib/cache-management.spec.ts b/packages/hub/src/lib/cache-management.spec.ts index 3655bab1a..3bfed63d9 100644 --- a/packages/hub/src/lib/cache-management.spec.ts +++ b/packages/hub/src/lib/cache-management.spec.ts @@ -123,16 +123,12 @@ describe("parseRepoType", () => { expect(parseRepoType("models")).toBe("model"); }); - test("should parse model repo type", () => { - expect(parseRepoType("model")).toBe("model"); - }); - test("should parse dataset repo type", () => { - expect(parseRepoType("dataset")).toBe("dataset"); + expect(parseRepoType("datasets")).toBe("dataset"); }); test("should parse space repo type", () => { - expect(parseRepoType("space")).toBe("space"); + expect(parseRepoType("spaces")).toBe("space"); }); test("should throw an error for invalid repo type", () => {