Add new episodes

rlancemartin · Mar 26, 2023 · 71abda1 · 71abda1 · vercel · Mar 26, 2023
1 parent cbcdbd4
commit 71abda1
Show file tree

Hide file tree

Showing 7 changed files with 4,454 additions and 31 deletions.
diff --git a/public/0366.jpg b/public/0366.jpg
diff --git a/public/0367.jpg b/public/0367.jpg
diff --git a/scripts/audio_transcription/0366.txt b/scripts/audio_transcription/0366.txt
diff --git a/scripts/audio_transcription/0367.txt b/scripts/audio_transcription/0367.txt
diff --git a/scripts/audio_transcription/episodes.csv b/scripts/audio_transcription/episodes.csv
@@ -0,0 +1,3 @@
+,index,number,link,title,img
+0,"Sam Altman: OpenAI CEO on GPT-4, ChatGPT, and the Future of AI | Lex Fridman Podcast #367",367.0,https://www.youtube.com/watch?v=L_Guz73e6fw&list=UUSHZKyawb77ixDdsGog4iWA&index=1,"Sam Altman: OpenAI CEO on GPT-4, ChatGPT, and the Future of AI | Lex Fridman Podcast #367",https://i.ytimg.com/vi/L_Guz73e6fw/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBb0zSsh5mBrwsY8FfjV04i0x47Jg
+1,"Shannon Curry: Johnny Depp & Amber Heard Trial, Marriage, Dating & Love | Lex Fridman Podcast #366",366.0,https://www.youtube.com/watch?v=qtOKrG_wK5A&list=UUSHZKyawb77ixDdsGog4iWA&index=2,"Shannon Curry: Johnny Depp & Amber Heard Trial, Marriage, Dating & Love | Lex Fridman Podcast #366",https://i.ytimg.com/vi/qtOKrG_wK5A/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAiqef4-F5_6YRtYpYj-SNWc46fYw
diff --git a/scripts/get_data.ipynb b/scripts/get_data.ipynb
@@ -2,18 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/site-packages/pinecone/index.py:4: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
-      "  from tqdm.autonotebook import tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os,re\n",
     "import yt_dlp\n",
@@ -74,11 +65,12 @@
     "        stor_metadata.loc[v['title'],'number']=ep_number\n",
     "        stor_metadata.loc[v['title'],'link']=v['link']\n",
     "        stor_metadata.loc[v['title'],'title']=v['title']\n",
+    "        stor_metadata.loc[v['title'],'img']=v['thumbnails'][3]['url']\n",
     "    except:\n",
     "        print(\"Failed on %s\", v['title'])\n",
     "\n",
-    "# Filter for newer videos (Karpathy transcribed 1-325)\n",
-    "new_ep = stor_metadata[stor_metadata.number > 325]"
+    "last_complete_video = 365\n",
+    "new_ep = stor_metadata[stor_metadata.number > last_complete_video]"
    ]
   },
   {
@@ -103,7 +95,7 @@
     "    img_url=new_ep.loc[ix,'img']\n",
     "    ep_link=new_ep.loc[ix,'link']\n",
     "    # Write img \n",
-    "    with open(\"img/%s.jpg\"%str(ep_number), 'wb') as f:\n",
+    "    with open(\"../public/0%s.jpg\"%str(ep_number), 'wb') as f:\n",
     "        response = requests.get(img_url)\n",
     "        f.write(response.content)\n",
     "    # Write audio\n",
@@ -128,7 +120,14 @@
    "source": [
     "`3. Run Whisper -`\n",
     " \n",
-    "* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model"
+    "* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model\n",
+    "* Run `python run_whisper.py`\n",
+    "\n",
+    "If running this step on a remote machine:\n",
+    "* scp the episode metadata: `audio_transcription/episodes.csv`\n",
+    "* scp the audio files: `audio/*`\n",
+    "* Run `python run_whisper.py`\n",
+    "* Then, scp the `audio_transcription/` directory back to the local machine"
    ]
   },
   {
@@ -364,10 +363,17 @@
    "source": [
     "# Join the list of lists \n",
     "splits_all = []\n",
-    "for sublist in [splits_scrape+splits_new]:\n",
+    "# For the initial write \n",
+    "# for sublist in [splits_scrape+splits_new]:\n",
+    "# For updates -- \n",
+    "for sublist in splits_new:\n",
     "    splits_all.extend(sublist)\n",
+    "\n",
     "metadatas_all = []\n",
-    "for sublist in [metadatas_scrape+metadatas_new]:\n",
+    "# For the initial write \n",
+    "# for sublist in [metadatas_scrape+metadatas_new]:\n",
+    "# For updates -- \n",
+    "for sublist in metadatas_new:\n",
     "    metadatas_all.extend(sublist)"
    ]
   },
@@ -390,14 +396,17 @@
     "    api_key=os.environ.get('PINECONE_API_KEY'),  \n",
     "    environment=\"us-east1-gcp\"  \n",
     ")\n",
+    "\n",
+    "# Initialize with small set of data - \n",
+    "# p = Pinecone.from_texts(splits_all[0:2], \n",
+    "#                        embeddings, \n",
+    "#                        index_name=index_name, \n",
+    "#                        metadatas=metadatas_all[0:2])\n",
+    "\n",
+    "# Update - \n",
     "index_name = \"lex-gpt\"\n",
     "embeddings = OpenAIEmbeddings()\n",
-    "\n",
-    "# Initialize with small set of data\n",
-    "p = Pinecone.from_texts(splits_all[0:2], \n",
-    "                        embeddings, \n",
-    "                        index_name=index_name, \n",
-    "                        metadatas=metadatas_all[0:2])"
+    "p = Pinecone.from_existing_index(index_name=index_name,embedding=embeddings)"
    ]
   },
   {
@@ -409,17 +418,17 @@
     "# Add data in chunk to avoid data ingest errors\n",
     "chunk_size = 100\n",
     "last_chunk = 0\n",
-    "num_chunks = math.ceil(len(splits_combined) / chunk_size)\n",
+    "num_chunks = math.ceil(len(splits_all) / chunk_size)\n",
     "for i in range(last_chunk,num_chunks):\n",
     "    \n",
     "    print(i)\n",
     "    start_time = time.time()\n",
     "    start_idx = i * chunk_size\n",
-    "    end_idx = min(start_idx + chunk_size, len(splits_combined))\n",
+    "    end_idx = min(start_idx + chunk_size, len(splits_all))\n",
     "    \n",
     "    # Extract the current chunk\n",
-    "    current_splits = splits_combined[start_idx:end_idx]\n",
-    "    current_metadatas = metadatas_combined[start_idx:end_idx]\n",
+    "    current_splits = splits_all[start_idx:end_idx]\n",
+    "    current_metadatas = metadatas_all[start_idx:end_idx]\n",
     "    \n",
     "    # Add the current chunk to the vector database\n",
     "    p.add_texts(texts = current_splits, metadatas=current_metadatas)\n",
@@ -448,7 +457,7 @@
     "    api_key=os.environ.get('PINECONE_API_KEY'),  \n",
     "    environment=\"us-east1-gcp\"  \n",
     ")\n",
-    "index_name = \"lex-gpt-new\"\n",
+    "index_name = \"lex-gpt\"\n",
     "embeddings = OpenAIEmbeddings()\n",
     "p = Pinecone.from_existing_index(index_name=index_name,embedding=embeddings)"
    ]
@@ -480,9 +489,16 @@
     "    print(\"--------\")\n",
     "\n",
     "llm = OpenAIChat(temperature=0)\n",
-    "q = \"What is the future path for AGI?\"\n",
-    "run_vectordb_sources_chain(llm,q,p)\n"
+    "q = \"What does Sam Altman think about GPT-4?\"\n",
+    "run_vectordb_sources_chain(llm,q,p)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/scripts/run_whisper.py b/scripts/run_whisper.py
@@ -21,7 +21,7 @@
 
     # get audio 
     audio_file_path='audio/%s.m4a'%str(ep_number)
-    out_file_path='audio_tx/"0"+%s.txt'%str(ep_number)
+    out_file_path='audio_transcription/"0"+%s.txt'%str(ep_number)
 
     print(f"Processing file: {audio_file_path}")
     logging.info(f"Processing file: {audio_file_path}")