forked from KeithGalli/Podcast-Downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
podcast.py
140 lines (113 loc) · 6.08 KB
/
podcast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import requests
import os
import re
from bs4 import BeautifulSoup
import openai
import numpy as np
from dotenv import load_dotenv, find_dotenv
import time
import json
import unicodedata
# Load variables from the nearest .env file into the process environment so
# the credentials below can be read with os.getenv.
load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")
# Only override the API base URL when one is actually configured: assigning
# None here would replace the openai library's default endpoint.
_api_base = os.getenv("API_BASE")
if _api_base:
    openai.api_base = _api_base
class Podcast:
    """A podcast identified by an RSS feed, with a JSON cache of episode embeddings.

    Each instance owns three locations under ``./Podcast-Downloader``: a
    download directory, a transcription directory, and a JSON file holding one
    ``{'title', 'description', 'embedding'}`` record per embedded episode.
    """

    # Seconds to wait for the RSS feed before giving up on the request.
    REQUEST_TIMEOUT_SECONDS = 30
    # Rate limiting for the embedding API: pause before the first request of a
    # run and again after every EMBEDDING_BATCH_SIZE requests.
    EMBEDDING_BATCH_SIZE = 9
    EMBEDDING_PAUSE_SECONDS = 8

    def __init__(self, name, rss_feed_url):
        """Store the podcast identity and create its working directories.

        Args:
            name: Human-readable podcast name; its slug names the directories.
            rss_feed_url: URL of the podcast's RSS feed.
        """
        self.name = name
        self.rss_feed_url = rss_feed_url
        slug = self.slugify(name)
        self.download_directory = f'./Podcast-Downloader/downloads/{slug}'
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(self.download_directory, exist_ok=True)
        self.transcription_directory = f'./Podcast-Downloader/transcripts/{slug}'
        os.makedirs(self.transcription_directory, exist_ok=True)
        self.description_embeddings_path = f'./Podcast-Downloader/description_embeddings/{slug}.json'

    def get_items(self):
        """Download the RSS feed and return all of its ``<item>`` elements."""
        # A timeout keeps an unresponsive feed from blocking the caller forever.
        page = requests.get(self.rss_feed_url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(page.text, 'xml')
        return soup.find_all('item')

    def get_embedding(self, text, model="text-embedding-ada-002"):
        """Return the embedding vector for ``text`` from the OpenAI embedding API.

        Newlines are replaced with spaces before the text is sent.
        """
        text = text.replace("\n", " ")
        return openai.Embedding.create(input = text, model=model)['data'][0]['embedding']

    def cosine_similarity(self, a, b):
        """Return the cosine similarity of vectors ``a`` and ``b``.

        Returns 0.0 when either vector has zero length, instead of dividing by
        zero and producing NaN (which would corrupt sort order in callers).
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def simplify_title(self):
        """Return the podcast name as a file-name-friendly string.

        Strips a fixed set of punctuation characters, truncates to 100
        characters, and replaces spaces with underscores.
        """
        file_name = re.sub(r'[%/&!@#\*\$\?\+\^\\.\\\\]', '', self.name)[:100].replace(' ', '_')
        return file_name

    def slugify(self, value, allow_unicode=False):
        """
        Taken from https://github.com/django/django/blob/master/django/utils/text.py
        Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
        dashes to single dashes. Remove characters that aren't alphanumerics,
        underscores, or hyphens. Convert to lowercase. Also strip leading and
        trailing whitespace, dashes, and underscores.
        """
        value = str(value)
        if allow_unicode:
            value = unicodedata.normalize('NFKC', value)
        else:
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value.lower())
        return re.sub(r'[-\s]+', '-', value).strip('-_')

    def save_description_embeddings(self, description_embeddings):
        """Write the list of embedding records to the podcast's JSON cache file.

        The parent directory is created if it does not exist yet, so this
        method can be called on its own.
        """
        directory = os.path.dirname(self.description_embeddings_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        description_embeddings_json = {'description_embeddings': description_embeddings}
        with open(self.description_embeddings_path, 'w', encoding='utf-8') as f:
            json.dump(description_embeddings_json, f)

    def _extract_description(self, raw_description):
        """Return the plain text of an episode description given as HTML."""
        soup = BeautifulSoup(raw_description, 'html.parser')
        description = "\n".join([p.get_text(strip=True) for p in soup.find_all('p')])
        if not description.strip():
            # No usable <p> content: fall back to the whole text of the field.
            description = soup.get_text(strip=True)
        return description

    def add_description_embeddings(self, items_limit, items=None):
        """Embed episodes that are not cached yet and persist the updated cache.

        Args:
            items_limit: Maximum number of new embeddings to create in this call.
            items: Optional feed items already fetched by the caller; when
                omitted the feed is downloaded with ``get_items``.
        """
        # get_description_embeddings already returns the list of records.
        description_embeddings = self.get_description_embeddings()
        if items is None:
            items = self.get_items()
        # A set gives O(1) membership tests instead of rebuilding a list of
        # titles on every iteration.
        known_titles = {entry['title'] for entry in description_embeddings}
        added = 0
        for item in items:
            if added >= items_limit:
                break
            title_tag = item.find('title')
            description_tag = item.find('description')
            if title_tag is None or description_tag is None:
                continue
            title = title_tag.text
            if title in known_titles:
                continue
            description = self._extract_description(description_tag.text)
            if not description.strip():
                # Nothing meaningful to embed for this episode.
                continue
            # Pause before the first request and after every batch of requests.
            if added % self.EMBEDDING_BATCH_SIZE == 0:
                time.sleep(self.EMBEDDING_PAUSE_SECONDS)
            description_embedding = self.get_embedding(description)
            description_embeddings.append({'title': title,
                                           'description': description,
                                           'embedding': description_embedding})
            known_titles.add(title)
            added += 1
        # Persist the updated cache.
        self.save_description_embeddings(description_embeddings)

    def get_description_embeddings(self):
        """Return the cached list of embedding records.

        If the cache file does not exist yet, an empty cache is created on
        disk and an empty list is returned.
        """
        if not os.path.exists(self.description_embeddings_path):
            description_embeddings = []
            self.save_description_embeddings(description_embeddings)
            return description_embeddings
        with open(self.description_embeddings_path, 'r', encoding='utf-8') as f:
            return json.load(f)['description_embeddings']

    def search_items(self, search_embedding, top_limit=2, items_limit=10):
        """Return the feed items whose descriptions best match ``search_embedding``.

        Args:
            search_embedding: Embedding vector of the search query.
            top_limit: Number of best-matching items to return.
            items_limit: Maximum number of new episode embeddings to create
                before searching.

        Returns:
            A list of feed items ordered from most to least similar. Cached
            episodes that are no longer present in the feed are skipped.
        """
        # Fetch the feed once and reuse it for both the cache update and lookup.
        items = self.get_items()
        self.add_description_embeddings(items_limit, items=items)
        description_embeddings = self.get_description_embeddings()
        sorted_description_embeddings = sorted(
            description_embeddings,
            key=lambda x: self.cosine_similarity(x['embedding'], search_embedding),
            reverse=True)
        # Index feed items by title; setdefault keeps the first occurrence of a
        # duplicated title, matching what list.index would have selected.
        items_by_title = {}
        for item in items:
            title_tag = item.find('title')
            if title_tag is not None:
                items_by_title.setdefault(title_tag.text, item)
        matched_podcasts = [items_by_title[entry['title']]
                            for entry in sorted_description_embeddings
                            if entry['title'] in items_by_title]
        return matched_podcasts[:top_limit]