Skip to content

Commit

Permalink
Add new episodes
Browse files Browse the repository at this point in the history
  • Loading branch information
barniker authored and rlm committed Mar 26, 2023
1 parent cbcdbd4 commit 71abda1
Show file tree
Hide file tree
Showing 7 changed files with 4,454 additions and 31 deletions.
Binary file added public/0366.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added public/0367.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2,458 changes: 2,458 additions & 0 deletions scripts/audio_transcription/0366.txt

Large diffs are not rendered by default.

1,946 changes: 1,946 additions & 0 deletions scripts/audio_transcription/0367.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions scripts/audio_transcription/episodes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,index,number,link,title,img
0,"Sam Altman: OpenAI CEO on GPT-4, ChatGPT, and the Future of AI | Lex Fridman Podcast #367",367.0,https://www.youtube.com/watch?v=L_Guz73e6fw&list=UUSHZKyawb77ixDdsGog4iWA&index=1,"Sam Altman: OpenAI CEO on GPT-4, ChatGPT, and the Future of AI | Lex Fridman Podcast #367",https://i.ytimg.com/vi/L_Guz73e6fw/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBb0zSsh5mBrwsY8FfjV04i0x47Jg
1,"Shannon Curry: Johnny Depp & Amber Heard Trial, Marriage, Dating & Love | Lex Fridman Podcast #366",366.0,https://www.youtube.com/watch?v=qtOKrG_wK5A&list=UUSHZKyawb77ixDdsGog4iWA&index=2,"Shannon Curry: Johnny Depp & Amber Heard Trial, Marriage, Dating & Love | Lex Fridman Podcast #366",https://i.ytimg.com/vi/qtOKrG_wK5A/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAiqef4-F5_6YRtYpYj-SNWc46fYw
76 changes: 46 additions & 30 deletions scripts/get_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/site-packages/pinecone/index.py:4: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm\n"
]
}
],
"outputs": [],
"source": [
"import os,re\n",
"import yt_dlp\n",
Expand Down Expand Up @@ -74,11 +65,12 @@
" stor_metadata.loc[v['title'],'number']=ep_number\n",
" stor_metadata.loc[v['title'],'link']=v['link']\n",
" stor_metadata.loc[v['title'],'title']=v['title']\n",
" stor_metadata.loc[v['title'],'img']=v['thumbnails'][3]['url']\n",
" except Exception as err:\n",
" print(\"Failed on %s: %s\" % (v['title'], err))\n",
"\n",
"# Filter for episodes that have not been transcribed yet (Karpathy transcribed 1-325)\n",
"new_ep = stor_metadata[stor_metadata.number > 325]"
"last_complete_video = 365\n",
"new_ep = stor_metadata[stor_metadata.number > last_complete_video]"
]
},
{
Expand All @@ -103,7 +95,7 @@
" img_url=new_ep.loc[ix,'img']\n",
" ep_link=new_ep.loc[ix,'link']\n",
" # Write img \n",
" with open(\"img/%s.jpg\"%str(ep_number), 'wb') as f:\n",
" with open(\"../public/0%s.jpg\"%str(ep_number), 'wb') as f:\n",
" response = requests.get(img_url)\n",
" f.write(response.content)\n",
" # Write audio\n",
Expand All @@ -128,7 +120,14 @@
"source": [
"`3. Run Whisper -`\n",
" \n",
"* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model"
"* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model\n",
"* Run `python run_whisper.py`\n",
"\n",
"If running this step on a remote machine:\n",
"* scp the episode metadata: `audio_transcription/episodes.csv`\n",
"* scp the audio files: `audio/*`\n",
"* Run `python run_whisper.py`\n",
"* Then, scp the `audio_transcription/` directory back to the local machine"
]
},
{
Expand Down Expand Up @@ -364,10 +363,17 @@
"source": [
"# Join the list of lists \n",
"splits_all = []\n",
"for sublist in [splits_scrape+splits_new]:\n",
"# For the initial write \n",
"# for sublist in [splits_scrape+splits_new]:\n",
"# For updates -- \n",
"for sublist in splits_new:\n",
" splits_all.extend(sublist)\n",
"\n",
"metadatas_all = []\n",
"for sublist in [metadatas_scrape+metadatas_new]:\n",
"# For the initial write \n",
"# for sublist in [metadatas_scrape+metadatas_new]:\n",
"# For updates -- \n",
"for sublist in metadatas_new:\n",
" metadatas_all.extend(sublist)"
]
},
Expand All @@ -390,14 +396,17 @@
" api_key=os.environ.get('PINECONE_API_KEY'), \n",
" environment=\"us-east1-gcp\" \n",
")\n",
"\n",
"# Initialize with small set of data - \n",
"# p = Pinecone.from_texts(splits_all[0:2], \n",
"# embeddings, \n",
"# index_name=index_name, \n",
"# metadatas=metadatas_all[0:2])\n",
"\n",
"# Update - \n",
"index_name = \"lex-gpt\"\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"# Initialize with small set of data\n",
"p = Pinecone.from_texts(splits_all[0:2], \n",
" embeddings, \n",
" index_name=index_name, \n",
" metadatas=metadatas_all[0:2])"
"p = Pinecone.from_existing_index(index_name=index_name,embedding=embeddings)"
]
},
{
Expand All @@ -409,17 +418,17 @@
"# Add data in chunks to avoid data ingest errors\n",
"chunk_size = 100\n",
"last_chunk = 0\n",
"num_chunks = math.ceil(len(splits_combined) / chunk_size)\n",
"num_chunks = math.ceil(len(splits_all) / chunk_size)\n",
"for i in range(last_chunk,num_chunks):\n",
" \n",
" print(i)\n",
" start_time = time.time()\n",
" start_idx = i * chunk_size\n",
" end_idx = min(start_idx + chunk_size, len(splits_combined))\n",
" end_idx = min(start_idx + chunk_size, len(splits_all))\n",
" \n",
" # Extract the current chunk\n",
" current_splits = splits_combined[start_idx:end_idx]\n",
" current_metadatas = metadatas_combined[start_idx:end_idx]\n",
" current_splits = splits_all[start_idx:end_idx]\n",
" current_metadatas = metadatas_all[start_idx:end_idx]\n",
" \n",
" # Add the current chunk to the vector database\n",
" p.add_texts(texts = current_splits, metadatas=current_metadatas)\n",
Expand Down Expand Up @@ -448,7 +457,7 @@
" api_key=os.environ.get('PINECONE_API_KEY'), \n",
" environment=\"us-east1-gcp\" \n",
")\n",
"index_name = \"lex-gpt-new\"\n",
"index_name = \"lex-gpt\"\n",
"embeddings = OpenAIEmbeddings()\n",
"p = Pinecone.from_existing_index(index_name=index_name,embedding=embeddings)"
]
Expand Down Expand Up @@ -480,9 +489,16 @@
" print(\"--------\")\n",
"\n",
"llm = OpenAIChat(temperature=0)\n",
"q = \"What is the future path for AGI?\"\n",
"run_vectordb_sources_chain(llm,q,p)\n"
"q = \"What does Sam Altman think about GPT-4?\"\n",
"run_vectordb_sources_chain(llm,q,p)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

# get audio
audio_file_path='audio/%s.m4a'%str(ep_number)
out_file_path='audio_tx/"0"+%s.txt'%str(ep_number)
# Prefix the episode number with a literal 0 in the file name (e.g. 367 -> 0367.txt).
out_file_path='audio_transcription/0%s.txt'%str(ep_number)

print(f"Processing file: {audio_file_path}")
logging.info(f"Processing file: {audio_file_path}")
Expand Down

2 comments on commit 71abda1

@vercel
Copy link

@vercel vercel bot commented on 71abda1 Mar 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vercel
Copy link

@vercel vercel bot commented on 71abda1 Mar 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.