From 885a7d1cdcff14b8a20653d92b073e93d2b9ca6b Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Mon, 15 Jul 2024 20:59:10 +0800
Subject: [PATCH 01/10] Wrap src into a python package.
- Adjust `src` structure.
- Cleanup requirements.txt.
---
requirements.txt | 20 -------
setup.py | 40 ++++++++++++++
src/knowledge_storm/__init__.py | 5 ++
src/{ => knowledge_storm}/interface.py | 0
src/{ => knowledge_storm}/lm.py | 0
src/{ => knowledge_storm}/rm.py | 4 +-
.../storm_wiki/__init__.py | 0
.../storm_wiki/engine.py | 54 +++++++++++--------
.../storm_wiki/modules/__init__.py | 2 +-
.../storm_wiki/modules/article_generation.py | 9 ++--
.../storm_wiki/modules/article_polish.py | 7 +--
.../storm_wiki/modules/callback.py | 2 +-
.../storm_wiki/modules/knowledge_curation.py | 12 +++--
.../storm_wiki/modules/outline_generation.py | 9 ++--
.../storm_wiki/modules/persona_generator.py | 0
.../storm_wiki/modules/retriever.py} | 52 +++++++++++++++---
.../storm_wiki/modules/storm_dataclass.py | 5 +-
src/{ => knowledge_storm}/utils.py | 0
src/storm_wiki/modules/retriever.py | 45 ----------------
19 files changed, 150 insertions(+), 116 deletions(-)
create mode 100644 setup.py
create mode 100644 src/knowledge_storm/__init__.py
rename src/{ => knowledge_storm}/interface.py (100%)
rename src/{ => knowledge_storm}/lm.py (100%)
rename src/{ => knowledge_storm}/rm.py (99%)
rename src/{ => knowledge_storm}/storm_wiki/__init__.py (100%)
rename src/{ => knowledge_storm}/storm_wiki/engine.py (85%)
rename src/{ => knowledge_storm}/storm_wiki/modules/__init__.py (74%)
rename src/{ => knowledge_storm}/storm_wiki/modules/article_generation.py (96%)
rename src/{ => knowledge_storm}/storm_wiki/modules/article_polish.py (96%)
rename src/{ => knowledge_storm}/storm_wiki/modules/callback.py (98%)
rename src/{ => knowledge_storm}/storm_wiki/modules/knowledge_curation.py (97%)
rename src/{ => knowledge_storm}/storm_wiki/modules/outline_generation.py (96%)
rename src/{ => knowledge_storm}/storm_wiki/modules/persona_generator.py (100%)
rename src/{storm_wiki/modules/internet_source_restrictions.json => knowledge_storm/storm_wiki/modules/retriever.py} (72%)
rename src/{ => knowledge_storm}/storm_wiki/modules/storm_dataclass.py (99%)
rename src/{ => knowledge_storm}/utils.py (100%)
delete mode 100644 src/storm_wiki/modules/retriever.py
diff --git a/requirements.txt b/requirements.txt
index b1bdfa51..8ac1b95b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,27 +1,7 @@
dspy_ai==2.4.9
-streamlit==1.31.1
wikipedia==1.4.0
-streamlit_authenticator==0.2.3
-streamlit_oauth==0.1.8
-streamlit-card
-google-cloud==0.34.0
-google-cloud-vision==3.5.0
-google-cloud-storage==2.14.0
sentence_transformers
toml
-markdown
-unidecode
-extra-streamlit-components==0.1.60
-google-cloud-firestore==2.14.0
-firebase-admin==6.4.0
-streamlit_extras
-streamlit_cookies_manager
-deprecation==2.1.0
-st-pages==0.4.5
-streamlit-float
-streamlit-option-menu
-sentry-sdk
-pdfkit==1.0.0
langchain-text-splitters
trafilatura
langchain-huggingface
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..00eb7a71
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,40 @@
+import re
+
+from setuptools import setup, find_packages
+
+# Read the content of the README file
+with open("README.md", encoding="utf-8") as f:
+ long_description = f.read()
+ # Remove p tags.
+    pattern = re.compile(r'<p.*?>.*?</p>', re.DOTALL)
+ long_description = re.sub(pattern, '', long_description)
+
+# Read the content of the requirements.txt file
+with open("requirements.txt", encoding="utf-8") as f:
+ requirements = f.read().splitlines()
+
+
+setup(
+ name="knowledge-storm",
+ version="0.2.1",
+ author="Yijia Shao, Yucheng Jiang",
+ author_email="shaoyj@stanford.edu, yuchengj@stanford.edu",
+ description="STORM: A language model-powered knowledge curation engine.",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ url="https://github.com/stanford-oval/storm",
+ license="MIT License",
+ package_dir={"": "src"},
+ packages=find_packages("src"),
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ ],
+ python_requires='>=3.9',
+ install_requires=requirements,
+)
diff --git a/src/knowledge_storm/__init__.py b/src/knowledge_storm/__init__.py
new file mode 100644
index 00000000..f1fd18ea
--- /dev/null
+++ b/src/knowledge_storm/__init__.py
@@ -0,0 +1,5 @@
+from .storm_wiki.engine import (
+ STORMWikiLMConfigs,
+ STORMWikiRunnerArguments,
+ STORMWikiRunner
+)
diff --git a/src/interface.py b/src/knowledge_storm/interface.py
similarity index 100%
rename from src/interface.py
rename to src/knowledge_storm/interface.py
diff --git a/src/lm.py b/src/knowledge_storm/lm.py
similarity index 100%
rename from src/lm.py
rename to src/knowledge_storm/lm.py
diff --git a/src/rm.py b/src/knowledge_storm/rm.py
similarity index 99%
rename from src/rm.py
rename to src/knowledge_storm/rm.py
index 5126aa5a..86f59703 100644
--- a/src/rm.py
+++ b/src/knowledge_storm/rm.py
@@ -5,13 +5,13 @@
import dspy
import pandas as pd
import requests
-from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
+from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models
from tqdm import tqdm
-from utils import WebPageHelper
+from .utils import WebPageHelper
class YouRM(dspy.Retrieve):
diff --git a/src/storm_wiki/__init__.py b/src/knowledge_storm/storm_wiki/__init__.py
similarity index 100%
rename from src/storm_wiki/__init__.py
rename to src/knowledge_storm/storm_wiki/__init__.py
diff --git a/src/storm_wiki/engine.py b/src/knowledge_storm/storm_wiki/engine.py
similarity index 85%
rename from src/storm_wiki/engine.py
rename to src/knowledge_storm/storm_wiki/engine.py
index 4191d35a..e0c8dfcc 100644
--- a/src/storm_wiki/engine.py
+++ b/src/knowledge_storm/storm_wiki/engine.py
@@ -5,17 +5,18 @@
from typing import Union, Literal, Optional
import dspy
-from interface import Engine, LMConfigs
-from lm import OpenAIModel
-from storm_wiki.modules.article_generation import StormArticleGenerationModule
-from storm_wiki.modules.article_polish import StormArticlePolishingModule
-from storm_wiki.modules.callback import BaseCallbackHandler
-from storm_wiki.modules.knowledge_curation import StormKnowledgeCurationModule
-from storm_wiki.modules.outline_generation import StormOutlineGenerationModule
-from storm_wiki.modules.persona_generator import StormPersonaGenerator
-from storm_wiki.modules.retriever import StormRetriever
-from storm_wiki.modules.storm_dataclass import StormInformationTable, StormArticle
-from utils import FileIOHelper, makeStringRed
+
+from .modules.article_generation import StormArticleGenerationModule
+from .modules.article_polish import StormArticlePolishingModule
+from .modules.callback import BaseCallbackHandler
+from .modules.knowledge_curation import StormKnowledgeCurationModule
+from .modules.outline_generation import StormOutlineGenerationModule
+from .modules.persona_generator import StormPersonaGenerator
+from .modules.retriever import StormRetriever
+from .modules.storm_dataclass import StormInformationTable, StormArticle
+from ..interface import Engine, LMConfigs
+from ..lm import OpenAIModel
+from ..utils import FileIOHelper, makeStringRed
class STORMWikiLMConfigs(LMConfigs):
@@ -233,16 +234,20 @@ def post_run(self):
f.write(json.dumps(call) + '\n')
def _load_information_table_from_local_fs(self, information_table_local_path):
- assert os.path.exists(information_table_local_path), makeStringRed(f"{information_table_local_path} not exists. Please set --do-research argument to prepare the conversation_log.json for this topic.")
+ assert os.path.exists(information_table_local_path), makeStringRed(
+ f"{information_table_local_path} not exists. Please set --do-research argument to prepare the conversation_log.json for this topic.")
return StormInformationTable.from_conversation_log_file(information_table_local_path)
-
+
def _load_outline_from_local_fs(self, topic, outline_local_path):
- assert os.path.exists(outline_local_path), makeStringRed(f"{outline_local_path} not exists. Please set --do-generate-outline argument to prepare the storm_gen_outline.txt for this topic.")
+ assert os.path.exists(outline_local_path), makeStringRed(
+ f"{outline_local_path} not exists. Please set --do-generate-outline argument to prepare the storm_gen_outline.txt for this topic.")
return StormArticle.from_outline_file(topic=topic, file_path=outline_local_path)
def _load_draft_article_from_local_fs(self, topic, draft_article_path, url_to_info_path):
- assert os.path.exists(draft_article_path), makeStringRed(f"{draft_article_path} not exists. Please set --do-generate-article argument to prepare the storm_gen_article.txt for this topic.")
- assert os.path.exists(url_to_info_path), makeStringRed(f"{url_to_info_path} not exists. Please set --do-generate-article argument to prepare the url_to_info.json for this topic.")
+ assert os.path.exists(draft_article_path), makeStringRed(
+ f"{draft_article_path} not exists. Please set --do-generate-article argument to prepare the storm_gen_article.txt for this topic.")
+ assert os.path.exists(url_to_info_path), makeStringRed(
+ f"{url_to_info_path} not exists. Please set --do-generate-article argument to prepare the url_to_info.json for this topic.")
article_text = FileIOHelper.load_str(draft_article_path)
references = FileIOHelper.load_json(url_to_info_path)
return StormArticle.from_string(topic_name=topic, article_text=article_text, references=references)
@@ -274,7 +279,8 @@ def run(self,
callback_handler: A callback handler to handle the intermediate results.
"""
assert do_research or do_generate_outline or do_generate_article or do_polish_article, \
- makeStringRed("No action is specified. Please set at least one of --do-research, --do-generate-outline, --do-generate-article, --do-polish-article")
+ makeStringRed(
+ "No action is specified. Please set at least one of --do-research, --do-generate-outline, --do-generate-article, --do-polish-article")
self.topic = topic
self.article_dir_name = topic.replace(' ', '_').replace('/', '_')
@@ -291,7 +297,8 @@ def run(self,
if do_generate_outline:
# load information table if it's not initialized
if information_table is None:
- information_table = self._load_information_table_from_local_fs(os.path.join(self.article_output_dir, 'conversation_log.json'))
+ information_table = self._load_information_table_from_local_fs(
+ os.path.join(self.article_output_dir, 'conversation_log.json'))
outline = self.run_outline_generation_module(information_table=information_table,
callback_handler=callback_handler)
@@ -299,9 +306,12 @@ def run(self,
draft_article: StormArticle = None
if do_generate_article:
if information_table is None:
- information_table = self._load_information_table_from_local_fs(os.path.join(self.article_output_dir, 'conversation_log.json'))
+ information_table = self._load_information_table_from_local_fs(
+ os.path.join(self.article_output_dir, 'conversation_log.json'))
if outline is None:
- outline = self._load_outline_from_local_fs(topic=topic, outline_local_path=os.path.join(self.article_output_dir, 'storm_gen_outline.txt'))
+ outline = self._load_outline_from_local_fs(topic=topic,
+ outline_local_path=os.path.join(self.article_output_dir,
+ 'storm_gen_outline.txt'))
draft_article = self.run_article_generation_module(outline=outline,
information_table=information_table,
callback_handler=callback_handler)
@@ -311,5 +321,7 @@ def run(self,
if draft_article is None:
draft_article_path = os.path.join(self.article_output_dir, 'storm_gen_article.txt')
url_to_info_path = os.path.join(self.article_output_dir, 'url_to_info.json')
- draft_article = self._load_draft_article_from_local_fs(topic=topic, draft_article_path=draft_article_path, url_to_info_path=url_to_info_path)
+ draft_article = self._load_draft_article_from_local_fs(topic=topic,
+ draft_article_path=draft_article_path,
+ url_to_info_path=url_to_info_path)
self.run_article_polishing_module(draft_article=draft_article, remove_duplicate=remove_duplicate)
diff --git a/src/storm_wiki/modules/__init__.py b/src/knowledge_storm/storm_wiki/modules/__init__.py
similarity index 74%
rename from src/storm_wiki/modules/__init__.py
rename to src/knowledge_storm/storm_wiki/modules/__init__.py
index 51ee0121..9419a314 100644
--- a/src/storm_wiki/modules/__init__.py
+++ b/src/knowledge_storm/storm_wiki/modules/__init__.py
@@ -1,4 +1,4 @@
from .knowledge_curation import *
from .persona_generator import *
from .retriever import *
-from .storm_dataclass import *
\ No newline at end of file
+from .storm_dataclass import *
diff --git a/src/storm_wiki/modules/article_generation.py b/src/knowledge_storm/storm_wiki/modules/article_generation.py
similarity index 96%
rename from src/storm_wiki/modules/article_generation.py
rename to src/knowledge_storm/storm_wiki/modules/article_generation.py
index 0dfb76de..a114b3ec 100644
--- a/src/storm_wiki/modules/article_generation.py
+++ b/src/knowledge_storm/storm_wiki/modules/article_generation.py
@@ -5,10 +5,11 @@
from typing import List, Union
import dspy
-from interface import ArticleGenerationModule
-from storm_wiki.modules.callback import BaseCallbackHandler
-from storm_wiki.modules.storm_dataclass import StormInformationTable, StormArticle, StormInformation
-from utils import ArticleTextProcessing
+
+from .callback import BaseCallbackHandler
+from .storm_dataclass import StormInformationTable, StormArticle, StormInformation
+from ...interface import ArticleGenerationModule
+from ...utils import ArticleTextProcessing
class StormArticleGenerationModule(ArticleGenerationModule):
diff --git a/src/storm_wiki/modules/article_polish.py b/src/knowledge_storm/storm_wiki/modules/article_polish.py
similarity index 96%
rename from src/storm_wiki/modules/article_polish.py
rename to src/knowledge_storm/storm_wiki/modules/article_polish.py
index 5f38f058..b70bb834 100644
--- a/src/storm_wiki/modules/article_polish.py
+++ b/src/knowledge_storm/storm_wiki/modules/article_polish.py
@@ -2,9 +2,10 @@
from typing import Union
import dspy
-from interface import ArticlePolishingModule
-from storm_wiki.modules.storm_dataclass import StormArticle
-from utils import ArticleTextProcessing
+
+from .storm_dataclass import StormArticle
+from ...interface import ArticlePolishingModule
+from ...utils import ArticleTextProcessing
class StormArticlePolishingModule(ArticlePolishingModule):
diff --git a/src/storm_wiki/modules/callback.py b/src/knowledge_storm/storm_wiki/modules/callback.py
similarity index 98%
rename from src/storm_wiki/modules/callback.py
rename to src/knowledge_storm/storm_wiki/modules/callback.py
index 945a45db..a4b702d4 100644
--- a/src/storm_wiki/modules/callback.py
+++ b/src/knowledge_storm/storm_wiki/modules/callback.py
@@ -31,4 +31,4 @@ def on_direct_outline_generation_end(self, outline: str, **kwargs):
def on_outline_refinement_end(self, outline: str, **kwargs):
"""Run when the outline refinement finishes."""
- pass
\ No newline at end of file
+ pass
diff --git a/src/storm_wiki/modules/knowledge_curation.py b/src/knowledge_storm/storm_wiki/modules/knowledge_curation.py
similarity index 97%
rename from src/storm_wiki/modules/knowledge_curation.py
rename to src/knowledge_storm/storm_wiki/modules/knowledge_curation.py
index 4fe7f159..8e881c65 100644
--- a/src/storm_wiki/modules/knowledge_curation.py
+++ b/src/knowledge_storm/storm_wiki/modules/knowledge_curation.py
@@ -5,14 +5,16 @@
from typing import Union, List, Tuple, Optional, Dict
import dspy
-from interface import KnowledgeCurationModule, Retriever
-from storm_wiki.modules.callback import BaseCallbackHandler
-from storm_wiki.modules.persona_generator import StormPersonaGenerator
-from storm_wiki.modules.storm_dataclass import DialogueTurn, StormInformationTable, StormInformation
-from utils import ArticleTextProcessing
+
+from .callback import BaseCallbackHandler
+from .persona_generator import StormPersonaGenerator
+from .storm_dataclass import DialogueTurn, StormInformationTable, StormInformation
+from ...interface import KnowledgeCurationModule, Retriever
+from ...utils import ArticleTextProcessing
try:
from streamlit.runtime.scriptrunner import add_script_run_ctx
+
streamlit_connection = True
except ImportError as err:
streamlit_connection = False
diff --git a/src/storm_wiki/modules/outline_generation.py b/src/knowledge_storm/storm_wiki/modules/outline_generation.py
similarity index 96%
rename from src/storm_wiki/modules/outline_generation.py
rename to src/knowledge_storm/storm_wiki/modules/outline_generation.py
index 2b09d523..1f45b1c2 100644
--- a/src/storm_wiki/modules/outline_generation.py
+++ b/src/knowledge_storm/storm_wiki/modules/outline_generation.py
@@ -1,10 +1,11 @@
from typing import Union, Optional, Tuple
import dspy
-from interface import OutlineGenerationModule
-from storm_wiki.modules.callback import BaseCallbackHandler
-from storm_wiki.modules.storm_dataclass import StormInformationTable, StormArticle
-from utils import ArticleTextProcessing
+
+from .callback import BaseCallbackHandler
+from .storm_dataclass import StormInformationTable, StormArticle
+from ...interface import OutlineGenerationModule
+from ...utils import ArticleTextProcessing
class StormOutlineGenerationModule(OutlineGenerationModule):
diff --git a/src/storm_wiki/modules/persona_generator.py b/src/knowledge_storm/storm_wiki/modules/persona_generator.py
similarity index 100%
rename from src/storm_wiki/modules/persona_generator.py
rename to src/knowledge_storm/storm_wiki/modules/persona_generator.py
diff --git a/src/storm_wiki/modules/internet_source_restrictions.json b/src/knowledge_storm/storm_wiki/modules/retriever.py
similarity index 72%
rename from src/storm_wiki/modules/internet_source_restrictions.json
rename to src/knowledge_storm/storm_wiki/modules/retriever.py
index 0a71e1ae..179ae99b 100644
--- a/src/storm_wiki/modules/internet_source_restrictions.json
+++ b/src/knowledge_storm/storm_wiki/modules/retriever.py
@@ -1,5 +1,15 @@
-{
- "generally_unreliable": [
+from typing import Union, List
+from urllib.parse import urlparse
+
+import dspy
+
+from .storm_dataclass import StormInformation
+from ...interface import Retriever, Information
+from ...utils import ArticleTextProcessing
+
+# Internet source restrictions according to Wikipedia standard:
+# https://en.wikipedia.org/wiki/Wikipedia:Reliable_sources/Perennial_sources
+GENERALLY_UNRELIABLE = {
"112_Ukraine",
"Ad_Fontes_Media",
"AlterNet",
@@ -139,9 +149,8 @@
"WordPress.com",
"Worldometer",
"YouTube",
- "ZDNet"
- ],
- "deprecated": [
+ "ZDNet"}
+DEPRECATED = {
"Al_Mayadeen",
"ANNA_News",
"Baidu_Baike",
@@ -189,8 +198,8 @@
"Voltaire_Network",
"WorldNetDaily",
"Zero_Hedge"
- ],
- "blacklisted": [
+}
+BLACKLISTED = {
"Advameg",
"bestgore.com",
"Breitbart_News",
@@ -210,5 +219,32 @@
"Swarajya",
"Veterans_Today",
"ZoomInfo"
- ]
}
+
+
+def is_valid_wikipedia_source(url):
+ parsed_url = urlparse(url)
+ # Check if the URL is from a reliable domain
+ combined_set = GENERALLY_UNRELIABLE | DEPRECATED | BLACKLISTED
+ for domain in combined_set:
+ if domain in parsed_url.netloc:
+ return False
+
+ return True
+
+
+class StormRetriever(Retriever):
+ def __init__(self, rm: dspy.Retrieve, k=3):
+ super().__init__(search_top_k=k)
+ self._rm = rm
+ if hasattr(rm, 'is_valid_source'):
+ rm.is_valid_source = is_valid_wikipedia_source
+
+ def retrieve(self, query: Union[str, List[str]], exclude_urls: List[str] = []) -> List[Information]:
+ retrieved_data_list = self._rm(query_or_queries=query, exclude_urls=exclude_urls)
+ for data in retrieved_data_list:
+ for i in range(len(data['snippets'])):
+ # STORM generate the article with citations. We do not consider multi-hop citations.
+ # Remove citations in the source to avoid confusion.
+ data['snippets'][i] = ArticleTextProcessing.remove_citations(data['snippets'][i])
+ return [StormInformation.from_dict(data) for data in retrieved_data_list]
diff --git a/src/storm_wiki/modules/storm_dataclass.py b/src/knowledge_storm/storm_wiki/modules/storm_dataclass.py
similarity index 99%
rename from src/storm_wiki/modules/storm_dataclass.py
rename to src/knowledge_storm/storm_wiki/modules/storm_dataclass.py
index d75760ce..4f54ec46 100644
--- a/src/storm_wiki/modules/storm_dataclass.py
+++ b/src/knowledge_storm/storm_wiki/modules/storm_dataclass.py
@@ -4,10 +4,11 @@
from typing import Union, Optional, Any, List, Tuple, Dict
import numpy as np
-from interface import Information, InformationTable, Article, ArticleSectionNode
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
-from utils import ArticleTextProcessing, FileIOHelper
+
+from ...interface import Information, InformationTable, Article, ArticleSectionNode
+from ...utils import ArticleTextProcessing, FileIOHelper
class StormInformation(Information):
diff --git a/src/utils.py b/src/knowledge_storm/utils.py
similarity index 100%
rename from src/utils.py
rename to src/knowledge_storm/utils.py
diff --git a/src/storm_wiki/modules/retriever.py b/src/storm_wiki/modules/retriever.py
deleted file mode 100644
index 79cc2060..00000000
--- a/src/storm_wiki/modules/retriever.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import json
-import os
-from typing import Union, List
-from urllib.parse import urlparse
-
-import dspy
-import storm_wiki.modules.storm_dataclass as storm_dataclass
-from interface import Retriever, Information
-from rm import YouRM
-from utils import ArticleTextProcessing
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(SCRIPT_DIR, 'internet_source_restrictions.json')) as f:
- domain_restriction_dict = json.load(f)
- GENERALLY_UNRELIABLE = set(domain_restriction_dict["generally_unreliable"])
- DEPRECATED = set(domain_restriction_dict["deprecated"])
- BLACKLISTED = set(domain_restriction_dict["blacklisted"])
-
-
-def is_valid_wikipedia_source(url):
- parsed_url = urlparse(url)
- # Check if the URL is from a reliable domain
- combined_set = GENERALLY_UNRELIABLE | DEPRECATED | BLACKLISTED
- for domain in combined_set:
- if domain in parsed_url.netloc:
- return False
-
- return True
-
-
-class StormRetriever(Retriever):
- def __init__(self, rm: dspy.Retrieve, k=3):
- super().__init__(search_top_k=k)
- self._rm = rm
- if hasattr(rm, 'is_valid_source'):
- rm.is_valid_source = is_valid_wikipedia_source
-
- def retrieve(self, query: Union[str, List[str]], exclude_urls: List[str] = []) -> List[Information]:
- retrieved_data_list = self._rm(query_or_queries=query, exclude_urls=exclude_urls)
- for data in retrieved_data_list:
- for i in range(len(data['snippets'])):
- # STORM generate the article with citations. We do not consider multi-hop citations.
- # Remove citations in the source to avoid confusion.
- data['snippets'][i] = ArticleTextProcessing.remove_citations(data['snippets'][i])
- return [storm_dataclass.StormInformation.from_dict(data) for data in retrieved_data_list]
From 9cf9eefdd04aac7888d95c495a4cd9a894c82187 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Mon, 15 Jul 2024 22:26:03 +0800
Subject: [PATCH 02/10] Adjust directory structure: src/knowledge_storm ->
knowledge_storm.
---
.../__init__.py | 0
.../interface.py | 0
.../knowledge_storm => knowledge_storm}/lm.py | 45 +++++++++++++++++--
.../knowledge_storm => knowledge_storm}/rm.py | 0
.../storm_wiki/__init__.py | 0
.../storm_wiki/engine.py | 0
.../storm_wiki/modules/__init__.py | 0
.../storm_wiki/modules/article_generation.py | 0
.../storm_wiki/modules/article_polish.py | 0
.../storm_wiki/modules/callback.py | 0
.../storm_wiki/modules/knowledge_curation.py | 0
.../storm_wiki/modules/outline_generation.py | 0
.../storm_wiki/modules/persona_generator.py | 0
.../storm_wiki/modules/retriever.py | 0
.../storm_wiki/modules/storm_dataclass.py | 0
.../utils.py | 3 ++
setup.py | 3 +-
17 files changed, 45 insertions(+), 6 deletions(-)
rename {src/knowledge_storm => knowledge_storm}/__init__.py (100%)
rename {src/knowledge_storm => knowledge_storm}/interface.py (100%)
rename {src/knowledge_storm => knowledge_storm}/lm.py (91%)
rename {src/knowledge_storm => knowledge_storm}/rm.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/__init__.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/engine.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/__init__.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/article_generation.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/article_polish.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/callback.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/knowledge_curation.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/outline_generation.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/persona_generator.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/retriever.py (100%)
rename {src/knowledge_storm => knowledge_storm}/storm_wiki/modules/storm_dataclass.py (100%)
rename {src/knowledge_storm => knowledge_storm}/utils.py (99%)
diff --git a/src/knowledge_storm/__init__.py b/knowledge_storm/__init__.py
similarity index 100%
rename from src/knowledge_storm/__init__.py
rename to knowledge_storm/__init__.py
diff --git a/src/knowledge_storm/interface.py b/knowledge_storm/interface.py
similarity index 100%
rename from src/knowledge_storm/interface.py
rename to knowledge_storm/interface.py
diff --git a/src/knowledge_storm/lm.py b/knowledge_storm/lm.py
similarity index 91%
rename from src/knowledge_storm/lm.py
rename to knowledge_storm/lm.py
index 60dff4d7..886137fd 100644
--- a/src/knowledge_storm/lm.py
+++ b/knowledge_storm/lm.py
@@ -25,13 +25,10 @@ def __init__(
self,
model: str = "gpt-3.5-turbo-instruct",
api_key: Optional[str] = None,
- api_provider: Literal["openai", "azure"] = "openai",
- api_base: Optional[str] = None,
model_type: Literal["chat", "text"] = None,
**kwargs
):
- super().__init__(model=model, api_key=api_key, api_provider=api_provider, api_base=api_base,
- model_type=model_type, **kwargs)
+ super().__init__(model=model, api_key=api_key, model_type=model_type, **kwargs)
self._token_usage_lock = threading.Lock()
self.prompt_tokens = 0
self.completion_tokens = 0
@@ -108,6 +105,46 @@ def __call__(
return completions
+class AzureOpenAIModel(dspy.AzureOpenAI):
+ """A wrapper class for dspy.AzureOpenAI."""
+ def __init__(
+ self,
+ api_base: str,
+ api_version: str,
+ model: str = "gpt-3.5-turbo-instruct",
+ api_key: Optional[str] = None,
+ model_type: Literal["chat", "text"] = "chat",
+ **kwargs,
+ ):
+ super().__init__(
+ api_base=api_base, api_version=api_version, model=model, api_key=api_key, model_type=model_type, **kwargs)
+ self._token_usage_lock = threading.Lock()
+ self.prompt_tokens = 0
+ self.completion_tokens = 0
+
+ def log_usage(self, response):
+ """Log the total tokens from the OpenAI API response.
+ Override log_usage() in dspy.AzureOpenAI for tracking accumulated token usage."""
+ usage_data = response.get('usage')
+ if usage_data:
+ with self._token_usage_lock:
+ self.prompt_tokens += usage_data.get('prompt_tokens', 0)
+ self.completion_tokens += usage_data.get('completion_tokens', 0)
+
+ def get_usage_and_reset(self):
+ """Get the total tokens used and reset the token usage."""
+ usage = {
+ self.kwargs.get('model') or self.kwargs.get('engine'):
+ {'prompt_tokens': self.prompt_tokens, 'completion_tokens': self.completion_tokens}
+ }
+ self.prompt_tokens = 0
+ self.completion_tokens = 0
+
+ return usage
+
+
+
+
class ClaudeModel(dspy.dsp.modules.lm.LM):
"""Copied from dspy/dsp/modules/anthropic.py with the addition of tracking token usage."""
diff --git a/src/knowledge_storm/rm.py b/knowledge_storm/rm.py
similarity index 100%
rename from src/knowledge_storm/rm.py
rename to knowledge_storm/rm.py
diff --git a/src/knowledge_storm/storm_wiki/__init__.py b/knowledge_storm/storm_wiki/__init__.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/__init__.py
rename to knowledge_storm/storm_wiki/__init__.py
diff --git a/src/knowledge_storm/storm_wiki/engine.py b/knowledge_storm/storm_wiki/engine.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/engine.py
rename to knowledge_storm/storm_wiki/engine.py
diff --git a/src/knowledge_storm/storm_wiki/modules/__init__.py b/knowledge_storm/storm_wiki/modules/__init__.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/__init__.py
rename to knowledge_storm/storm_wiki/modules/__init__.py
diff --git a/src/knowledge_storm/storm_wiki/modules/article_generation.py b/knowledge_storm/storm_wiki/modules/article_generation.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/article_generation.py
rename to knowledge_storm/storm_wiki/modules/article_generation.py
diff --git a/src/knowledge_storm/storm_wiki/modules/article_polish.py b/knowledge_storm/storm_wiki/modules/article_polish.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/article_polish.py
rename to knowledge_storm/storm_wiki/modules/article_polish.py
diff --git a/src/knowledge_storm/storm_wiki/modules/callback.py b/knowledge_storm/storm_wiki/modules/callback.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/callback.py
rename to knowledge_storm/storm_wiki/modules/callback.py
diff --git a/src/knowledge_storm/storm_wiki/modules/knowledge_curation.py b/knowledge_storm/storm_wiki/modules/knowledge_curation.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/knowledge_curation.py
rename to knowledge_storm/storm_wiki/modules/knowledge_curation.py
diff --git a/src/knowledge_storm/storm_wiki/modules/outline_generation.py b/knowledge_storm/storm_wiki/modules/outline_generation.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/outline_generation.py
rename to knowledge_storm/storm_wiki/modules/outline_generation.py
diff --git a/src/knowledge_storm/storm_wiki/modules/persona_generator.py b/knowledge_storm/storm_wiki/modules/persona_generator.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/persona_generator.py
rename to knowledge_storm/storm_wiki/modules/persona_generator.py
diff --git a/src/knowledge_storm/storm_wiki/modules/retriever.py b/knowledge_storm/storm_wiki/modules/retriever.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/retriever.py
rename to knowledge_storm/storm_wiki/modules/retriever.py
diff --git a/src/knowledge_storm/storm_wiki/modules/storm_dataclass.py b/knowledge_storm/storm_wiki/modules/storm_dataclass.py
similarity index 100%
rename from src/knowledge_storm/storm_wiki/modules/storm_dataclass.py
rename to knowledge_storm/storm_wiki/modules/storm_dataclass.py
diff --git a/src/knowledge_storm/utils.py b/knowledge_storm/utils.py
similarity index 99%
rename from src/knowledge_storm/utils.py
rename to knowledge_storm/utils.py
index cc1a6e58..5cf6f457 100644
--- a/src/knowledge_storm/utils.py
+++ b/knowledge_storm/utils.py
@@ -1,5 +1,6 @@
import concurrent.futures
import json
+import logging
import os
import pickle
import re
@@ -11,6 +12,8 @@
from langchain_text_splitters import RecursiveCharacterTextSplitter
from trafilatura import extract
+logging.getLogger("httpx").setLevel(logging.WARNING) # Disable INFO logging for httpx.
+
def load_api_key(toml_file_path):
try:
diff --git a/setup.py b/setup.py
index 00eb7a71..120927b3 100644
--- a/setup.py
+++ b/setup.py
@@ -24,8 +24,7 @@
long_description_content_type="text/markdown",
url="https://github.com/stanford-oval/storm",
license="MIT License",
- package_dir={"": "src"},
- packages=find_packages("src"),
+ packages=find_packages(),
classifiers=[
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: MIT License",
From 97ca8501f36cd3250a0d68fdb02547e975d4e662 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Mon, 15 Jul 2024 22:59:32 +0800
Subject: [PATCH 03/10] Fix AzureOpenAIModel.
---
knowledge_storm/lm.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/knowledge_storm/lm.py b/knowledge_storm/lm.py
index 886137fd..5da0ebf8 100644
--- a/knowledge_storm/lm.py
+++ b/knowledge_storm/lm.py
@@ -109,8 +109,8 @@ class AzureOpenAIModel(dspy.AzureOpenAI):
"""A wrapper class for dspy.AzureOpenAI."""
def __init__(
self,
- api_base: str,
- api_version: str,
+ api_base: Optional[str] = None,
+ api_version: Optional[str] = None,
model: str = "gpt-3.5-turbo-instruct",
api_key: Optional[str] = None,
model_type: Literal["chat", "text"] = "chat",
@@ -143,8 +143,6 @@ def get_usage_and_reset(self):
return usage
-
-
class ClaudeModel(dspy.dsp.modules.lm.LM):
"""Copied from dspy/dsp/modules/anthropic.py with the addition of tracking token usage."""
From 708fc04fdd5c0a19345148fd3db0833565672b45 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Mon, 15 Jul 2024 22:59:48 +0800
Subject: [PATCH 04/10] Update example scripts.
---
examples/run_storm_wiki_claude.py | 12 +++----
examples/run_storm_wiki_gpt.py | 34 +++++++++++---------
examples/run_storm_wiki_gpt_with_VectorRM.py | 29 ++++++++++-------
examples/run_storm_wiki_mistral.py | 12 +++----
4 files changed, 47 insertions(+), 40 deletions(-)
diff --git a/examples/run_storm_wiki_claude.py b/examples/run_storm_wiki_claude.py
index f29ba1ce..31fef1e1 100644
--- a/examples/run_storm_wiki_claude.py
+++ b/examples/run_storm_wiki_claude.py
@@ -17,14 +17,12 @@
"""
import os
-import sys
from argparse import ArgumentParser
-sys.path.append('./src')
-from lm import ClaudeModel
-from rm import YouRM, BingSearch
-from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
-from utils import load_api_key
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.lm import ClaudeModel
+from knowledge_storm.rm import YouRM, BingSearch
+from knowledge_storm.utils import load_api_key
def main(args):
@@ -116,4 +114,4 @@ def main(args):
parser.add_argument('--remove-duplicate', action='store_true',
help='If True, remove duplicate content from the article.')
- main(parser.parse_args())
\ No newline at end of file
+ main(parser.parse_args())
diff --git a/examples/run_storm_wiki_gpt.py b/examples/run_storm_wiki_gpt.py
index f7639d69..b7968152 100644
--- a/examples/run_storm_wiki_gpt.py
+++ b/examples/run_storm_wiki_gpt.py
@@ -20,14 +20,12 @@
"""
import os
-import sys
from argparse import ArgumentParser
-sys.path.append('./src')
-from lm import OpenAIModel
-from rm import YouRM, BingSearch
-from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
-from utils import load_api_key
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.lm import OpenAIModel, AzureOpenAIModel
+from knowledge_storm.rm import YouRM, BingSearch
+from knowledge_storm.utils import load_api_key
def main(args):
@@ -35,23 +33,29 @@ def main(args):
lm_configs = STORMWikiLMConfigs()
openai_kwargs = {
'api_key': os.getenv("OPENAI_API_KEY"),
- 'api_provider': os.getenv('OPENAI_API_TYPE'),
'temperature': 1.0,
'top_p': 0.9,
- 'api_base': os.getenv('AZURE_API_BASE'),
- 'api_version': os.getenv('AZURE_API_VERSION'),
}
+ ModelClass = OpenAIModel if os.getenv('OPENAI_API_TYPE') == 'openai' else AzureOpenAIModel
+ # If you are using Azure service, make sure the model name matches your own deployed model name.
+ # The default name here is only used for demonstration and may not match your case.
+ gpt_35_model_name = 'gpt-3.5-turbo' if os.getenv('OPENAI_API_TYPE') == 'openai' else 'gpt-35-turbo'
+ gpt_4_model_name = 'gpt-4o'
+ if os.getenv('OPENAI_API_TYPE') == 'azure':
+ openai_kwargs['api_base'] = os.getenv('AZURE_API_BASE')
+ openai_kwargs['api_version'] = os.getenv('AZURE_API_VERSION')
+
# STORM is a LM system so different components can be powered by different models.
# For a good balance between cost and quality, you can choose a cheaper/faster model for conv_simulator_lm
# which is used to split queries, synthesize answers in the conversation. We recommend using stronger models
# for outline_gen_lm which is responsible for organizing the collected information, and article_gen_lm
# which is responsible for generating sections with citations.
- conv_simulator_lm = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
- question_asker_lm = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
- outline_gen_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=400, **openai_kwargs)
- article_gen_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=700, **openai_kwargs)
- article_polish_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=4000, **openai_kwargs)
+ conv_simulator_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
+ question_asker_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
+ outline_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=400, **openai_kwargs)
+ article_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=700, **openai_kwargs)
+ article_polish_lm = ModelClass(model=gpt_4_model_name, max_tokens=4000, **openai_kwargs)
lm_configs.set_conv_simulator_lm(conv_simulator_lm)
lm_configs.set_question_asker_lm(question_asker_lm)
@@ -122,4 +126,4 @@ def main(args):
parser.add_argument('--remove-duplicate', action='store_true',
help='If True, remove duplicate content from the article.')
- main(parser.parse_args())
\ No newline at end of file
+ main(parser.parse_args())
diff --git a/examples/run_storm_wiki_gpt_with_VectorRM.py b/examples/run_storm_wiki_gpt_with_VectorRM.py
index c5dd4354..2c07ffc2 100644
--- a/examples/run_storm_wiki_gpt_with_VectorRM.py
+++ b/examples/run_storm_wiki_gpt_with_VectorRM.py
@@ -30,11 +30,10 @@
import sys
from argparse import ArgumentParser
-sys.path.append('./src')
-from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
-from rm import VectorRM
-from lm import OpenAIModel
-from utils import load_api_key
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.rm import VectorRM
+from knowledge_storm.lm import OpenAIModel, AzureOpenAIModel
+from knowledge_storm.utils import load_api_key
def main(args):
@@ -45,21 +44,29 @@ def main(args):
engine_lm_configs = STORMWikiLMConfigs()
openai_kwargs = {
'api_key': os.getenv("OPENAI_API_KEY"),
- 'api_provider': os.getenv('OPENAI_API_TYPE'),
'temperature': 1.0,
'top_p': 0.9,
}
+ ModelClass = OpenAIModel if os.getenv('OPENAI_API_TYPE') == 'openai' else AzureOpenAIModel
+ # If you are using Azure service, make sure the model name matches your own deployed model name.
+ # The default name here is only used for demonstration and may not match your case.
+ gpt_35_model_name = 'gpt-3.5-turbo' if os.getenv('OPENAI_API_TYPE') == 'openai' else 'gpt-35-turbo'
+ gpt_4_model_name = 'gpt-4o'
+ if os.getenv('OPENAI_API_TYPE') == 'azure':
+ openai_kwargs['api_base'] = os.getenv('AZURE_API_BASE')
+ openai_kwargs['api_version'] = os.getenv('AZURE_API_VERSION')
+
# STORM is a LM system so different components can be powered by different models.
# For a good balance between cost and quality, you can choose a cheaper/faster model for conv_simulator_lm
# which is used to split queries, synthesize answers in the conversation. We recommend using stronger models
# for outline_gen_lm which is responsible for organizing the collected information, and article_gen_lm
# which is responsible for generating sections with citations.
- conv_simulator_lm = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
- question_asker_lm = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
- outline_gen_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=400, **openai_kwargs)
- article_gen_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=700, **openai_kwargs)
- article_polish_lm = OpenAIModel(model='gpt-4-0125-preview', max_tokens=4000, **openai_kwargs)
+ conv_simulator_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
+ question_asker_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
+ outline_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=400, **openai_kwargs)
+ article_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=700, **openai_kwargs)
+ article_polish_lm = ModelClass(model=gpt_4_model_name, max_tokens=4000, **openai_kwargs)
engine_lm_configs.set_conv_simulator_lm(conv_simulator_lm)
engine_lm_configs.set_question_asker_lm(question_asker_lm)
diff --git a/examples/run_storm_wiki_mistral.py b/examples/run_storm_wiki_mistral.py
index f7bc22dd..eb6a4ff6 100644
--- a/examples/run_storm_wiki_mistral.py
+++ b/examples/run_storm_wiki_mistral.py
@@ -16,16 +16,14 @@
storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True)
"""
import os
-import sys
from argparse import ArgumentParser
from dspy import Example
-sys.path.append('./src')
-from lm import VLLMClient
-from rm import YouRM, BingSearch
-from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
-from utils import load_api_key
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.lm import VLLMClient
+from knowledge_storm.rm import YouRM, BingSearch
+from knowledge_storm.utils import load_api_key
def main(args):
@@ -174,4 +172,4 @@ def main(args):
parser.add_argument('--remove-duplicate', action='store_true',
help='If True, remove duplicate content from the article.')
- main(parser.parse_args())
\ No newline at end of file
+ main(parser.parse_args())
From 03ace467b8fc1aa93a8bed3fcf65c5ebb355591d Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Tue, 16 Jul 2024 11:34:37 +0800
Subject: [PATCH 05/10] Update Readme.md.
---
README.md | 172 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 103 insertions(+), 69 deletions(-)
diff --git a/README.md b/README.md
index 182b5086..7ac80e18 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,16 @@
| Research preview | Paper | Website |
-
+
**Latest News** 🔥
+- [2024/07] You can now install our package with `pip install knowledge-storm`!
- [2024/07] We add `VectorRM` to support grounding on user-provided documents, complementing existing support of search engines (`YouRM`, `BingSearch`). (check out [#58](https://github.com/stanford-oval/storm/pull/58))
- [2024/07] We release demo light for developers a minimal user interface built with streamlit framework in Python, handy for local development and demo hosting (checkout [#54](https://github.com/stanford-oval/storm/pull/54))
- [2024/06] We will present STORM at NAACL 2024! Find us at Poster Session 2 on June 17 or check our [presentation material](assets/storm_naacl2024_slides.pdf).
-- [2024/05] We add Bing Search support in [rm.py](src/rm.py). Test STORM with `GPT-4o` - we now configure the article generation part in our demo using `GPT-4o` model.
-- [2024/04] We release refactored version of STORM codebase! We define [interface](src/interface.py) for STORM pipeline and reimplement STORM-wiki (check out [`src/storm_wiki`](src/storm_wiki)) to demonstrate how to instantiate the pipeline. We provide API to support customization of different language models and retrieval/search integration.
+- [2024/05] We add Bing Search support in [rm.py](knowledge_storm/rm.py). Test STORM with `GPT-4o` - we now configure the article generation part in our demo using `GPT-4o` model.
+- [2024/04] We release refactored version of STORM codebase! We define [interface](knowledge_storm/interface.py) for STORM pipeline and reimplement STORM-wiki (check out [`knowledge_storm/storm_wiki`](knowledge_storm/storm_wiki)) to demonstrate how to instantiate the pipeline. We provide API to support customization of different language models and retrieval/search integration.
## Overview [(Try STORM now!)](https://storm.genie.stanford.edu/)
@@ -46,17 +47,17 @@ Based on the separation of the two stages, STORM is implemented in a highly modu
-## Getting started
+## Installation
-### 1. Setup
-Below, we provide a quick start guide to run STORM locally.
+To install the knowledge storm library, use `pip install knowledge-storm`.
+You could also install the source code which allows you to modify the behavior of STORM engine directly.
1. Clone the git repository.
- ```shell
- git clone https://github.com/stanford-oval/storm.git
- cd storm
- ```
+ ```shell
+ git clone https://github.com/stanford-oval/storm.git
+ cd storm
+ ```
2. Install the required packages.
```shell
@@ -64,7 +65,77 @@ Below, we provide a quick start guide to run STORM locally.
conda activate storm
pip install -r requirements.txt
```
-3. Set up OpenAI API key (if you want to use OpenAI models to power STORM) and [You.com search API](https://api.you.com/) key. Create a file `secrets.toml` under the root directory and add the following content:
+
+
+## API
+The STORM knowledge curation engine is defined as a simple Python `STORMWikiRunner` class.
+
+As STORM is working in the information curation layer, you need to set up the information retrieval module and language model module to create a `STORMWikiRunner` instance. Here is an example of using You.com search engine and OpenAI models.
+```python
+import os
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.lm import OpenAIModel
+from knowledge_storm.rm import YouRM
+
+
+lm_configs = STORMWikiLMConfigs()
+openai_kwargs = {
+ 'api_key': os.getenv("OPENAI_API_KEY"),
+ 'temperature': 1.0,
+ 'top_p': 0.9,
+}
+
+# STORM is a LM system so different components can be powered by different models to reach a good balance between cost and quality.
+# For a good practice, choose a cheaper/faster model for `conv_simulator_lm` which is used to split queries, synthesize answers in the conversation. Choose a more powerful model for `article_gen_lm` to generate verifiable text with citations.
+gpt_35 = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
+gpt_4 = OpenAIModel(model='gpt-4-o', max_tokens=3000, **openai_kwargs)
+
+lm_configs.set_conv_simulator_lm(gpt_35)
+lm_configs.set_question_asker_lm(gpt_35)
+lm_configs.set_outline_gen_lm(gpt_4)
+lm_configs.set_article_gen_lm(gpt_4)
+lm_configs.set_article_polish_lm(gpt_4)
+
+
+# Check out the STORMWikiRunnerArguments class for more configurations.
+engine_args = STORMWikiRunnerArguments(...)
+
+rm = YouRM(ydc_api_key=os.getenv('YDC_API_KEY'), k=engine_args.search_top_k)
+
+runner = STORMWikiRunner(engine_args, lm_configs, rm)
+```
+
+Currently, our package support:
+- `OpenAIModel`, `AzureOpenAIModel`, `ClaudeModel`, `VLLMClient`, `TGIClient`, `TogetherClient` as language model components
+- `YouRM`, `BingSearch`, `VectorRM` as retrieval components
+
+:star2: **PRs for integrating more language models into [knowledge_storm/lm.py](knowledge_storm/lm.py) and search engines/retrievers into [knowledge_storm/rm.py](knowledge_storm/rm.py) are highly appreciated!**
+
+The `STORMWikiRunner` instance can be evoked with the simple `run` method:
+```python
+topic = input('Topic: ')
+runner.run(
+ topic=topic,
+ do_research=True,
+ do_generate_outline=True,
+ do_generate_article=True,
+ do_polish_article=True,
+)
+runner.post_run()
+runner.summary()
+```
+- `do_research`: if True, simulate conversations with different perspectives to collect information about the topic; otherwise, load the results.
+- `do_generate_outline`: if True, generate an outline for the topic; otherwise, load the results.
+- `do_generate_article`: if True, generate an article for the topic based on the outline and the collected information; otherwise, load the results.
+- `do_polish_article`: if True, polish the article by adding a summarization section and (optionally) removing duplicate content; otherwise, load the results.
+
+
+## Quick Start with Example Scripts
+
+We provide scripts in our [examples folder](examples) as a quick start to run STORM with different configurations.
+
+**To run STORM with `gpt` family models with default configurations:**
+1. We suggest using `secrets.toml` to set up the API keys. Create a file `secrets.toml` under the root directory and add the following content:
```shell
# Set up OpenAI API key.
OPENAI_API_KEY="your_openai_api_key"
@@ -77,72 +148,31 @@ Below, we provide a quick start guide to run STORM locally.
# Set up You.com search API key.
YDC_API_KEY="your_youcom_api_key"
```
+2. Run the following command.
+ ```
+ python examples/run_storm_wiki_gpt.py \
+ --output-dir $OUTPUT_DIR \
+ --retriever you \
+ --do-research \
+ --do-generate-outline \
+ --do-generate-article \
+ --do-polish-article
+ ```
+**To run STORM using your favorite language models or grounding on your own corpus:** Check out [examples/README.md](examples/README.md).
-### 2. Running STORM-wiki locally
-
-**To run STORM with `gpt` family models with default configurations**: Make sure you have set up the OpenAI API key and run the following command.
-
-```
-python examples/run_storm_wiki_gpt.py \
- --output-dir $OUTPUT_DIR \
- --retriever you \
- --do-research \
- --do-generate-outline \
- --do-generate-article \
- --do-polish-article
-```
-- `--do-research`: if True, simulate conversation to research the topic; otherwise, load the results.
-- `--do-generate-outline`: If True, generate an outline for the topic; otherwise, load the results.
-- `--do-generate-article`: If True, generate an article for the topic; otherwise, load the results.
-- `--do-polish-article`: If True, polish the article by adding a summarization section and (optionally) removing duplicate content.
-
-
-We provide more example scripts under [`examples`](examples) to demonstrate how you can run STORM using your favorite language models or grounding on your own corpus.
-
-
-## Customize STORM
-### Customization of the Pipeline
+## Customization of the Pipeline
-Besides running scripts in `examples`, you can customize STORM based on your own use case. STORM engine consists of 4 modules:
+If you have installed the source code, you can customize STORM based on your own use case. STORM engine consists of 4 modules:
1. Knowledge Curation Module: Collects a broad coverage of information about the given topic.
2. Outline Generation Module: Organizes the collected information by generating a hierarchical outline for the curated knowledge.
3. Article Generation Module: Populates the generated outline with the collected information.
4. Article Polishing Module: Refines and enhances the written article for better presentation.
-The interface for each module is defined in `src/interface.py`, while their implementations are instantiated in `src/storm_wiki/modules/*`. These modules can be customized according to your specific requirements (e.g., generating sections in bullet point format instead of full paragraphs).
+The interface for each module is defined in `knowledge_storm/interface.py`, while their implementations are instantiated in `knowledge_storm/storm_wiki/modules/*`. These modules can be customized according to your specific requirements (e.g., generating sections in bullet point format instead of full paragraphs).
-:star2: **You can share your customization of `Engine` by making PRs to this repo!**
-
-### Customization of Retriever Module
-
-As a knowledge curation engine, STORM grabs information from the Retriever module. The Retriever modules are implemented in [`src/rm.py`](src/rm.py). Currently, STORM supports the following retrievers:
-
-- `YouRM`: You.com search engine API
-- `BingSearch`: Bing Search API
-- `VectorRM`: a retrieval model that retrieves information from user provide corpus
-
-:star2: **PRs for integrating more search engines/retrievers are highly appreciated!**
-
-### Customization of Language Models
-
-STORM provides the following language model implementations in [`src/lm.py`](src/lm.py):
-
-- `OpenAIModel`
-- `ClaudeModel`
-- `VLLMClient`
-- `TGIClient`
-- `TogetherClient`
-
-:star2: **PRs for integrating more language model clients are highly appreciated!**
-
-:bulb: **For a good practice,**
-
-- choose a cheaper/faster model for `conv_simulator_lm` which is used to split queries, synthesize answers in the conversation.
-- if you need to conduct the actual writing step, choose a more powerful model for `article_gen_lm`. Based on our experiments, weak models are bad at generating text with citations.
-- for open models, adding one-shot example can help it better follow instructions.
Please refer to the scripts in the [`examples`](examples) directory for concrete guidance on customizing the language model used in the pipeline.
@@ -157,7 +187,7 @@ Please switch to the branch `NAACL-2024-code-backup`
The FreshWiki dataset used in our experiments can be found in [./FreshWiki](FreshWiki).
-Run the following commands under [./src](src).
+Run the following commands under [./knowledge_storm](knowledge_storm).
#### Pre-writing Stage
For batch experiment on FreshWiki dataset:
@@ -196,7 +226,7 @@ python -m scripts.run_writing --input-source console --engine gpt-4 --do-polish-
The generated article will be saved in `{output_dir}/{topic}/storm_gen_article.txt` and the references corresponding to citation index will be saved in `{output_dir}/{topic}/url_to_info.json`. If `--do-polish-article` is set, the polished article will be saved in `{output_dir}/{topic}/storm_gen_article_polished.txt`.
### Customize the STORM Configurations
-We set up the default LLM configuration in `LLMConfigs` in [src/modules/utils.py](src/modules/utils.py). You can use `set_conv_simulator_lm()`,`set_question_asker_lm()`, `set_outline_gen_lm()`, `set_article_gen_lm()`, `set_article_polish_lm()` to override the default configuration. These functions take in an instance from `dspy.dsp.LM` or `dspy.dsp.HFModel`.
+We set up the default LLM configuration in `LLMConfigs` in [knowledge_storm/modules/utils.py](knowledge_storm/modules/utils.py). You can use `set_conv_simulator_lm()`,`set_question_asker_lm()`, `set_outline_gen_lm()`, `set_article_gen_lm()`, `set_article_polish_lm()` to override the default configuration. These functions take in an instance from `dspy.dsp.LM` or `dspy.dsp.HFModel`.
### Automatic Evaluation
@@ -224,7 +254,11 @@ For rubric grading, we use the [prometheus-13b-v1.0](https://huggingface.co/prom
-## Contributions
+## Roadmap & Contributions
+Our team is actively working on:
+1. Human-in-the-Loop Functionalities: Supporting user participation in the knowledge curation process.
+2. Information Abstraction: Developing abstractions for curated information to support presentation formats beyond the Wikipedia-style report.
+
If you have any questions or suggestions, please feel free to open an issue or pull request. We welcome contributions to improve the system and the codebase!
Contact person: [Yijia Shao](mailto:shaoyj@stanford.edu) and [Yucheng Jiang](mailto:yuchengj@stanford.edu)
From 16a6903bf3ba7f9bac3b3dde999edb54d1e6cfac Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Tue, 16 Jul 2024 11:44:03 +0800
Subject: [PATCH 06/10] Update demo light to match the change.
---
frontend/demo_light/README.md | 3 ++-
frontend/demo_light/demo_util.py | 15 ++++++++-------
frontend/demo_light/storm.py | 4 ----
3 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/frontend/demo_light/README.md b/frontend/demo_light/README.md
index 6a41c0e0..6ff58789 100644
--- a/frontend/demo_light/README.md
+++ b/frontend/demo_light/README.md
@@ -15,7 +15,8 @@ This is a minimal user interface for `STORMWikiRunner` which includes the follow
## Setup
-1. Besides the required packages for `STORMWikiRunner`, you need to install additional packages:
+1. Make sure you have installed `knowledge-storm` or set up the source code correctly.
+2. Install additional packages required by the user interface:
```bash
pip install -r requirements.txt
```
diff --git a/frontend/demo_light/demo_util.py b/frontend/demo_light/demo_util.py
index d940aa09..e8a51823 100644
--- a/frontend/demo_light/demo_util.py
+++ b/frontend/demo_light/demo_util.py
@@ -1,20 +1,22 @@
import base64
import datetime
-import io
import json
import os
import re
from typing import Optional
import markdown
-import pdfkit
import pytz
import streamlit as st
-from lm import OpenAIModel
-from rm import YouRM
-from storm_wiki.engine import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
-from storm_wiki.modules.callback import BaseCallbackHandler
+# If you install the source code instead of the `knowledge-storm` package,
+# Uncomment the following lines:
+# import sys
+# sys.path.append('../../')
+from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
+from knowledge_storm.lm import OpenAIModel
+from knowledge_storm.rm import YouRM
+from knowledge_storm.storm_wiki.modules.callback import BaseCallbackHandler
from stoc import stoc
@@ -529,7 +531,6 @@ def display_article_page(selected_article_name, selected_article_file_path_dict,
_display_main_article(selected_article_file_path_dict)
-
class StreamlitCallbackHandler(BaseCallbackHandler):
def __init__(self, status_container):
self.status_container = status_container
diff --git a/frontend/demo_light/storm.py b/frontend/demo_light/storm.py
index 9a0ae663..c68b88cf 100644
--- a/frontend/demo_light/storm.py
+++ b/frontend/demo_light/storm.py
@@ -1,12 +1,8 @@
import os
-import sys
script_dir = os.path.dirname(os.path.abspath(__file__))
wiki_root_dir = os.path.dirname(os.path.dirname(script_dir))
-sys.path.append(os.path.normpath(os.path.join(script_dir, '../../src/storm_wiki')))
-sys.path.append(os.path.normpath(os.path.join(script_dir, '../../src')))
-
import demo_util
from pages_util import MyArticles, CreateNewArticle
from streamlit_float import *
From 10b2646ef77800d65003108bda251f62de29dca5 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Tue, 16 Jul 2024 11:53:37 +0800
Subject: [PATCH 07/10] Bump up the version.
v0.2.1 is used for testing pypi publication.
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 120927b3..8eb50150 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
setup(
name="knowledge-storm",
- version="0.2.1",
+ version="0.2.2",
author="Yijia Shao, Yucheng Jiang",
author_email="shaoyj@stanford.edu, yuchengj@stanford.edu",
description="STORM: A language model-powered knowledge curation engine.",
From 441563d2ec80938a7222e2dd0e7c6a90cd2fa492 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Tue, 16 Jul 2024 12:06:30 +0800
Subject: [PATCH 08/10] Nit.
---
README.md | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/README.md b/README.md
index 7ac80e18..ddd56fc7 100644
--- a/README.md
+++ b/README.md
@@ -77,37 +77,30 @@ from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWiki
from knowledge_storm.lm import OpenAIModel
from knowledge_storm.rm import YouRM
-
lm_configs = STORMWikiLMConfigs()
openai_kwargs = {
'api_key': os.getenv("OPENAI_API_KEY"),
'temperature': 1.0,
'top_p': 0.9,
}
-
# STORM is a LM system so different components can be powered by different models to reach a good balance between cost and quality.
# For a good practice, choose a cheaper/faster model for `conv_simulator_lm` which is used to split queries, synthesize answers in the conversation. Choose a more powerful model for `article_gen_lm` to generate verifiable text with citations.
gpt_35 = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
gpt_4 = OpenAIModel(model='gpt-4-o', max_tokens=3000, **openai_kwargs)
-
lm_configs.set_conv_simulator_lm(gpt_35)
lm_configs.set_question_asker_lm(gpt_35)
lm_configs.set_outline_gen_lm(gpt_4)
lm_configs.set_article_gen_lm(gpt_4)
lm_configs.set_article_polish_lm(gpt_4)
-
-
# Check out the STORMWikiRunnerArguments class for more configurations.
engine_args = STORMWikiRunnerArguments(...)
-
rm = YouRM(ydc_api_key=os.getenv('YDC_API_KEY'), k=engine_args.search_top_k)
-
runner = STORMWikiRunner(engine_args, lm_configs, rm)
```
Currently, our package support:
- `OpenAIModel`, `AzureOpenAIModel`, `ClaudeModel`, `VLLMClient`, `TGIClient`, `TogetherClient` as language model components
-- `YouRM`, `BingSearch`, `VectorRM` as retrieval components
+- `YouRM`, `BingSearch`, `VectorRM` as retrieval module components
:star2: **PRs for integrating more language models into [knowledge_storm/lm.py](knowledge_storm/lm.py) and search engines/retrievers into [knowledge_storm/rm.py](knowledge_storm/rm.py) are highly appreciated!**
@@ -174,8 +167,6 @@ If you have installed the source code, you can customize STORM based on your own
The interface for each module is defined in `knowledge_storm/interface.py`, while their implementations are instantiated in `knowledge_storm/storm_wiki/modules/*`. These modules can be customized according to your specific requirements (e.g., generating sections in bullet point format instead of full paragraphs).
-Please refer to the scripts in the [`examples`](examples) directory for concrete guidance on customizing the language model used in the pipeline.
-
## Replicate NAACL2024 result
Please switch to the branch `NAACL-2024-code-backup`
From eaf858962578d3fd44e532351e4b5faceaa756d2 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Wed, 17 Jul 2024 19:54:45 +0800
Subject: [PATCH 09/10] Merge ollama support (#81).
---
README.md | 2 +-
knowledge_storm/lm.py | 1 +
setup.py | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index ddd56fc7..806c5411 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ runner = STORMWikiRunner(engine_args, lm_configs, rm)
```
Currently, our package support:
-- `OpenAIModel`, `AzureOpenAIModel`, `ClaudeModel`, `VLLMClient`, `TGIClient`, `TogetherClient` as language model components
+- `OpenAIModel`, `AzureOpenAIModel`, `ClaudeModel`, `VLLMClient`, `TGIClient`, `TogetherClient`, `OllamaClient` as language model components
- `YouRM`, `BingSearch`, `VectorRM` as retrieval module components
:star2: **PRs for integrating more language models into [knowledge_storm/lm.py](knowledge_storm/lm.py) and search engines/retrievers into [knowledge_storm/rm.py](knowledge_storm/rm.py) are highly appreciated!**
diff --git a/knowledge_storm/lm.py b/knowledge_storm/lm.py
index c3e1ce88..e1ec8e29 100644
--- a/knowledge_storm/lm.py
+++ b/knowledge_storm/lm.py
@@ -312,6 +312,7 @@ def _generate(self, prompt, **kwargs):
print("Failed to parse JSON response:", response.text)
raise Exception("Received invalid JSON response from server")
+
class OllamaClient(dspy.OllamaLocal):
"""A wrapper class for dspy.OllamaClient."""
diff --git a/setup.py b/setup.py
index 8eb50150..e4fa40c1 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
setup(
name="knowledge-storm",
- version="0.2.2",
+ version="0.2.3",
author="Yijia Shao, Yucheng Jiang",
author_email="shaoyj@stanford.edu, yuchengj@stanford.edu",
description="STORM: A language model-powered knowledge curation engine.",
From 3fa0e0e61c5ea12f406c60407bc7b79481d9fe66 Mon Sep 17 00:00:00 2001
From: shaoyijia
Date: Thu, 18 Jul 2024 11:23:26 +0800
Subject: [PATCH 10/10] Add a line break to readme.md.
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 806c5411..dae8b41a 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,8 @@ openai_kwargs = {
'top_p': 0.9,
}
# STORM is a LM system so different components can be powered by different models to reach a good balance between cost and quality.
-# For a good practice, choose a cheaper/faster model for `conv_simulator_lm` which is used to split queries, synthesize answers in the conversation. Choose a more powerful model for `article_gen_lm` to generate verifiable text with citations.
+# For a good practice, choose a cheaper/faster model for `conv_simulator_lm` which is used to split queries, synthesize answers in the conversation.
+# Choose a more powerful model for `article_gen_lm` to generate verifiable text with citations.
gpt_35 = OpenAIModel(model='gpt-3.5-turbo', max_tokens=500, **openai_kwargs)
gpt_4 = OpenAIModel(model='gpt-4-o', max_tokens=3000, **openai_kwargs)
lm_configs.set_conv_simulator_lm(gpt_35)