Skip to content

Commit

Permalink
refactor: migrate reader type into core (#1111)
Browse files Browse the repository at this point in the history
  • Loading branch information
himself65 authored Aug 9, 2024
1 parent 01c184c commit 089f1d4
Show file tree
Hide file tree
Showing 19 changed files with 141 additions and 159 deletions.
2 changes: 1 addition & 1 deletion packages/core/src/schema/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export * from "./node";
export { TransformComponent } from "./type";
export { FileReader, TransformComponent, type BaseReader } from "./type";
export { EngineResponse } from "./type/engine-response";
export * from "./zod";
38 changes: 36 additions & 2 deletions packages/core/src/schema/type.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { randomUUID } from "@llamaindex/env";
import type { BaseNode } from "./node";
import { fs, path, randomUUID } from "@llamaindex/env";
import type { BaseNode, Document } from "./node";

interface TransformComponentSignature {
<Options extends Record<string, unknown>>(
Expand Down Expand Up @@ -28,3 +28,37 @@ export class TransformComponent {
return transform;
}
}

/**
* A reader imports data into Document objects.
*/
export interface BaseReader {
// Accepts reader-specific arguments and resolves to the loaded documents.
loadData(...args: unknown[]): Promise<Document[]>;
}

/**
 * Base class for readers that turn a single file path into Document objects.
 *
 * Subclasses implement `loadDataAsContent`, which parses raw file bytes.
 * `loadData` reads the file, delegates parsing, and then stamps every
 * resulting document with an id and file metadata.
 */
export abstract class FileReader implements BaseReader {
  /**
   * Parse already-loaded file bytes into documents.
   *
   * @param fileContent - Raw bytes of the file.
   * @param fileName - Optional file name; `loadData` passes the base name of
   *   the path it read.
   * @returns The documents produced from the content.
   */
  abstract loadDataAsContent(
    fileContent: Uint8Array,
    fileName?: string,
  ): Promise<Document[]>;

  /**
   * Read the file at `filePath`, parse it via `loadDataAsContent`, and attach
   * id and file metadata to each document. Errors from reading or parsing are
   * not caught here and propagate to the caller.
   *
   * @param filePath - Path of the file to load.
   * @returns The parsed documents, mutated in place with id and metadata.
   */
  async loadData(filePath: string): Promise<Document[]> {
    const bytes = await fs.readFile(filePath);
    const baseName = path.basename(filePath);
    const documents = await this.loadDataAsContent(bytes, baseName);
    const stamp = FileReader.addMetaData(filePath);
    documents.forEach(stamp);
    return documents;
  }

  /**
   * Build a callback that assigns an id and file metadata to a document.
   *
   * @param filePath - Path used to derive the id, `file_path` and `file_name`.
   * @returns A `(doc, index)` callback suitable for `Array.prototype.forEach`.
   */
  static addMetaData(filePath: string) {
    const stamp = (doc: Document, index: number) => {
      // loadDataAsContent only deals with content, so the id is generated here
      // from the path as given plus the 1-based position of the document.
      const position = index + 1;
      doc.id_ = `${filePath}_${position}`;
      doc.metadata["file_path"] = path.resolve(filePath);
      doc.metadata["file_name"] = path.basename(filePath);
    };
    return stamp;
  }
}
4 changes: 2 additions & 2 deletions packages/llamaindex/src/ingestion/IngestionPipeline.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import type { TransformComponent } from "@llamaindex/core/schema";
import type { BaseReader, TransformComponent } from "@llamaindex/core/schema";
import {
ModalityType,
splitNodesByType,
type BaseNode,
type Document,
type Metadata,
} from "@llamaindex/core/schema";
import type { BaseReader } from "../readers/type.js";
import type { BaseDocumentStore } from "../storage/docStore/types.js";
import type {
VectorStore,
Expand Down Expand Up @@ -107,6 +106,7 @@ export class IngestionPipeline {
inputNodes.push(this.documents);
}
if (this.reader) {
// fixme: empty parameter might cause error
inputNodes.push(await this.reader.loadData());
}
return inputNodes.flat();
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/AssemblyAIReader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Document } from "@llamaindex/core/schema";
import { type BaseReader, Document } from "@llamaindex/core/schema";
import { getEnv } from "@llamaindex/env";
import type {
BaseServiceParams,
Expand All @@ -8,7 +8,6 @@ import type {
TranscriptSentence,
} from "assemblyai";
import { AssemblyAI } from "assemblyai";
import type { BaseReader } from "./type.js";

type AssemblyAIOptions = Partial<BaseServiceParams>;
const defaultOptions = {
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/CSVReader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { Document } from "@llamaindex/core/schema";
import { type BaseReader, Document, FileReader } from "@llamaindex/core/schema";
import type { ParseConfig } from "papaparse";
import Papa from "papaparse";
import { FileReader } from "./type.js";

/**
* papaparse-based csv parser
Expand Down
4 changes: 2 additions & 2 deletions packages/llamaindex/src/readers/DiscordReader.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { REST, type RESTOptions } from "@discordjs/rest";
import { Document } from "@llamaindex/core/schema";
import { Document, type BaseReader } from "@llamaindex/core/schema";
import { getEnv } from "@llamaindex/env";
import { Routes, type APIEmbed, type APIMessage } from "discord-api-types/v10";

/**
* Represents a reader for Discord messages using @discordjs/rest
* See https://github.com/discordjs/discord.js/tree/main/packages/rest
*/
export class DiscordReader {
export class DiscordReader implements BaseReader {
private client: REST;

constructor(
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/DocxReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { Document } from "@llamaindex/core/schema";
import { Document, FileReader } from "@llamaindex/core/schema";
import mammoth from "mammoth";
import { FileReader } from "./type.js";

export class DocxReader extends FileReader {
/** DocxParser */
Expand Down
4 changes: 1 addition & 3 deletions packages/llamaindex/src/readers/HTMLReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import { Document } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";

import { Document, FileReader } from "@llamaindex/core/schema";
/**
* Extract the significant text from an arbitrary HTML document.
* The contents of any head, script, style, and xml tags are removed completely.
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/ImageReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import type { Document } from "@llamaindex/core/schema";
import { ImageDocument } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";
import { FileReader, ImageDocument } from "@llamaindex/core/schema";

/**
* Reads the content of an image file into a Document object (which stores the image file as a Blob).
Expand Down
4 changes: 1 addition & 3 deletions packages/llamaindex/src/readers/JSONReader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import type { JSONValue } from "@llamaindex/core/global";
import { Document } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";

import { Document, FileReader } from "@llamaindex/core/schema";
export interface JSONReaderOptions {
/**
* Whether to ensure only ASCII characters.
Expand Down
89 changes: 87 additions & 2 deletions packages/llamaindex/src/readers/LlamaParseReader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,92 @@
import { Document } from "@llamaindex/core/schema";
import { Document, FileReader } from "@llamaindex/core/schema";
import { fs, getEnv } from "@llamaindex/env";
import { filetypeinfo } from "magic-bytes.js";
import { FileReader, type Language, type ResultType } from "./type.js";

/**
 * String literal union: "text", "markdown" or "json".
 * NOTE(review): presumably the output format requested from the parser —
 * confirm against its use in LlamaParseReader.
 */
export type ResultType = "text" | "markdown" | "json";
/**
 * Union of the string codes accepted wherever a `Language` is expected.
 * NOTE(review): presumably identifies the language of the document being
 * parsed — confirm against its use in LlamaParseReader.
 */
export type Language =
| "abq"
| "ady"
| "af"
| "ang"
| "ar"
| "as"
| "ava"
| "az"
| "be"
| "bg"
| "bh"
| "bho"
| "bn"
| "bs"
| "ch_sim"
| "ch_tra"
| "che"
| "cs"
| "cy"
| "da"
| "dar"
| "de"
| "en"
| "es"
| "et"
| "fa"
| "fr"
| "ga"
| "gom"
| "hi"
| "hr"
| "hu"
| "id"
| "inh"
| "is"
| "it"
| "ja"
| "kbd"
| "kn"
| "ko"
| "ku"
| "la"
| "lbe"
| "lez"
| "lt"
| "lv"
| "mah"
| "mai"
| "mi"
| "mn"
| "mr"
| "ms"
| "mt"
| "ne"
| "new"
| "nl"
| "no"
| "oc"
| "pi"
| "pl"
| "pt"
| "ro"
| "ru"
| "rs_cyrillic"
| "rs_latin"
| "sck"
| "sk"
| "sl"
| "sq"
| "sv"
| "sw"
| "ta"
| "tab"
| "te"
| "th"
| "tjk"
| "tl"
| "tr"
| "ug"
| "uk"
| "ur"
| "uz"
| "vi";

const SUPPORT_FILE_EXT: string[] = [
".pdf",
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/MarkdownReader.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { Document } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";
import { Document, FileReader } from "@llamaindex/core/schema";

type MarkdownTuple = [string | null, string];

Expand Down
2 changes: 1 addition & 1 deletion packages/llamaindex/src/readers/NotionReader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { BaseReader } from "@llamaindex/core/schema";
import { Document } from "@llamaindex/core/schema";
import type { Crawler, CrawlerOptions, Page } from "notion-md-crawler";
import { crawler, pageToString } from "notion-md-crawler";
import type { BaseReader } from "./type.js";

type NotionReaderOptions = Pick<CrawlerOptions, "client" | "serializers">;

Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/PDFReader.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { Document } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";
import { Document, FileReader } from "@llamaindex/core/schema";

/**
* Read the text of a PDF
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { BaseReader, FileReader } from "@llamaindex/core/schema";
import { Document } from "@llamaindex/core/schema";
import { path } from "@llamaindex/env";
import { walk } from "../storage/FileSystem.js";
import { TextFileReader } from "./TextFileReader.js";
import type { BaseReader, FileReader } from "./type.js";
import pLimit from "./utils.js";

type ReaderCallback = (
Expand Down
2 changes: 1 addition & 1 deletion packages/llamaindex/src/readers/SimpleDirectoryReader.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import type { FileReader } from "@llamaindex/core/schema";
import { Document } from "@llamaindex/core/schema";
import { PapaCSVReader } from "./CSVReader.js";
import { DocxReader } from "./DocxReader.js";
Expand All @@ -10,7 +11,6 @@ import {
type SimpleDirectoryReaderLoadDataParams,
} from "./SimpleDirectoryReader.edge.js";
import { TextFileReader } from "./TextFileReader.js";
import type { FileReader } from "./type.js";

export const FILE_EXT_TO_READER: Record<string, FileReader> = {
txt: new TextFileReader(),
Expand Down
3 changes: 1 addition & 2 deletions packages/llamaindex/src/readers/SimpleMongoReader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import type { Metadata } from "@llamaindex/core/schema";
import { Document } from "@llamaindex/core/schema";
import { type BaseReader, Document } from "@llamaindex/core/schema";
import type { MongoClient } from "mongodb";
import type { BaseReader } from "./type.js";

/**
* Read in from MongoDB
Expand Down
4 changes: 1 addition & 3 deletions packages/llamaindex/src/readers/TextFileReader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import { Document } from "@llamaindex/core/schema";
import { FileReader } from "./type.js";

import { Document, FileReader } from "@llamaindex/core/schema";
/**
* Read a .txt file
*/
Expand Down
Loading

0 comments on commit 089f1d4

Please sign in to comment.