Commit
·
4b16400
0
Parent(s):
Duplicate from somuch4subtlety/pogcastGPT
Browse filesCo-authored-by: SoMuch4Subtlety <[email protected]>
- .gitattributes +34 -0
- README.md +25 -0
- app.py +108 -0
- requirements.txt +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: PogcastGPT
|
| 3 |
+
emoji: 💻
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.10.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
duplicated_from: somuch4subtlety/pogcastGPT
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
This app uses semantic search to find and summarize relevant sections of the Pogcast to answer a user's question.
|
| 15 |
+
|
| 16 |
+
The process began by downloading and transcribing Pogcast episodes using [OpenAI’s Whisper](https://github.com/openai/whisper).
|
| 17 |
+
The transcriptions were then chunked into sections of ~500 words and each chunk was vectorized using [OpenAI’s embedding endpoint](https://beta.openai.com/docs/guides/embeddings).
|
| 18 |
+
The embeddings and text are then stored in a [vector database](https://www.pinecone.io).
|
| 19 |
+
|
| 20 |
+
When you ask a question, the text is run through the embedding endpoint and then is compared to all of the vectorized sections using cosine similarity.
|
| 21 |
+
The top results are used as context and passed to [OpenAI’s GPT-3 completion endpoint](https://beta.openai.com/docs/api-reference/completions) along with your question and an explanation of how GPT-3 should answer the question.
|
| 22 |
+
Lastly, the summary answer and top matching sections are displayed.
|
| 23 |
+
|
| 24 |
+
Note
|
| 25 |
+
The parameters and completion prompt are set loosely and the bot is likely to hallucinate during its answers.
|
app.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pinecone
|
| 3 |
+
import openai
|
| 4 |
+
import uuid
|
| 5 |
+
|
| 6 |
+
@st.experimental_singleton
def init_pinecone():
    """Return a handle to the Pinecone index, created once per session.

    Decorated with ``st.experimental_singleton`` so Streamlit's repeated
    script reruns reuse a single initialised client instead of
    reconnecting on every interaction. Credentials and the index name
    come from Streamlit secrets.
    """
    pinecone.init(api_key=st.secrets["PINECONE_KEY"], environment="us-west1-gcp")
    return pinecone.Index(st.secrets["PINECONE_INDEX"])
|
| 10 |
+
|
| 11 |
+
# OpenAI credentials are read from Streamlit secrets — never hard-coded.
openai.organization = st.secrets["OPENAI_ORG"]
openai.api_key = st.secrets["OPENAI_KEY"]
| 14 |
+
|
| 15 |
+
def modCheck(query):
    """Return True when OpenAI's moderation endpoint flags *query*.

    Used as a gate before any completion call so disallowed content is
    never forwarded to the embedding or completion endpoints.
    """
    moderation = openai.Moderation.create(input=query)
    return moderation["results"][0]['flagged']
|
| 18 |
+
|
| 19 |
+
def promptMaker(query, matchtext, prompt_type=None):
    """Assemble the GPT-3 completion prompt.

    Args:
        query: The user's inquiry text (already stripped of any "/type"
            command prefix).
        matchtext: Newline-joined transcript snippets used as context.
        prompt_type: Optional creative format (e.g. "poem"); when given,
            GPT-3 is asked to write that format about the query instead
            of answering it.

    Returns:
        The full prompt string, ending in "Result:" so the model
        continues with its answer.
    """
    # Fixed scene-setting header shared by both prompt shapes.
    header = "The Pogcast is a weekly podcast co-hosted by Veritas and Jesse Kazam. They are both twitch streamers and on the podcast they discuss all the poggers things in life like the first-person shooter Escape from Tarkov, chess, speed-running, and everyday activities relevant to being a twitch streamer.\n"

    if prompt_type:
        # "/poem topic"-style command: ask for a creative rendition.
        body = f"Use the following snippets from the podcast to write a {prompt_type} about {query}\nSnippets: {matchtext}\nResult:"
    else:
        # Default Q&A / summarization instructions.
        body = (
            "You will be given relevant snippets from the Pogcast that should help you answer or provide context to an inquiry. \n"
            "If the inquiry is in the form of a question, answer it in a verbose manner, provide a quote from the snippets to support your answer, and provide a deep summarization of the relevant portions of the snippets.\n"
            "If the inquiry is not in the form of a question, summarize the parts of the snippets most relevant to the inquiry.\n"
            f"Snippets:\n{matchtext} \nInquiry: {query}\nResult:"
        )
    return header + body
|
| 29 |
+
|
| 30 |
+
def runInquiry(query):
    """Answer a user inquiry about the Pogcast, end to end.

    Pipeline: parse an optional "/<type>" command prefix -> moderation
    check -> embed the query -> Pinecone similarity search -> GPT-3
    completion -> render the answer plus one card per matching
    transcript section.

    Returns:
        (completion, matches) on success, or None when the inquiry is
        rejected (too short, or flagged by moderation).
    """
    # A leading "/word" selects a special prompt type, e.g. "/poem tarkov".
    prompt_type = None
    if query.startswith("/"):
        tokens = query.split(" ")
        prompt_type = tokens[0][1:]
        query = " ".join(tokens[1:]).strip()

    if len(query) < 6:
        st.error("Please ask a question with at least 6 characters")
        return
    with st.spinner('Checking query...'):
        if modCheck(query):
            st.error("You know what you did. I ain't answering that.")
            return

    with st.spinner('Embedding query...'):
        query_vector = openai.Embedding.create(input=query, engine="text-embedding-ada-002")['data'][0]['embedding']
        index = init_pinecone()
        res = index.query(query_vector, namespace=st.secrets["PINECONE_NAMESPACE"], top_k=5, include_metadata=True)
    with st.spinner('Thinking...'):
        # Only the three best matches are fed to GPT-3 as context.
        matchtext = "\n".join(match['metadata']['content'] for match in res['matches'][:3])

        # Stable per-session id, forwarded to OpenAI for abuse monitoring.
        if 'uid' not in st.session_state:
            st.session_state.uid = str(uuid.uuid4())

        comp = openai.Completion.create(
            model="text-davinci-003",
            prompt=promptMaker(query, matchtext, prompt_type),
            max_tokens=2000,
            temperature=0.9,
            user=st.session_state.uid
        )
        st.markdown(f"""
        <div>
            <p class="lead">{comp['choices'][0]['text']}</p>
        </div>
        """, unsafe_allow_html=True)

        # All five matches get a card, not just the three used as context.
        for context in res['matches']:
            card(
                context['metadata']['episode_num'],
                context['metadata']['episode_id'],
                context['metadata']['start_second'],
                context['metadata']['end_second'],
                context['metadata']['content']
            )
        return (comp, res['matches'])
|
| 77 |
+
|
| 78 |
+
def card(episode, episode_id, start_second, end_second, context):
    """Render one search-result card.

    Shows an embedded YouTube player seeked to the matching timestamp, a
    deep link to the episode at that second, and a ~200-character preview
    of the transcript snippet. Relies on the Bootstrap CSS injected by
    the page setup for its grid classes.
    """
    start = int(start_second)  # YouTube embed/start params want whole seconds
    return st.markdown(f"""
    <div class="container-fluid mb-2">
        <div class="row align-items-start">
            <div class="col-md-4 col-sm-4">
                <div class="position-relative">
                    <iframe width="220" height="124" src="https://www.youtube.com/embed/{episode_id}?start={start}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
                </div>
            </div>
            <div class="col-md-8 col-sm-8">
                <a href=https://www.youtube.com/watch?v={episode_id}&t={start}s>Episode {int(episode)}</a>
                <br>
                <span style="color: #808080;">
                    <small>{context[:200].capitalize()+"...."}</small>
                </span>
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)
|
| 97 |
+
|
| 98 |
+
# ---- Page layout (top-level script, re-run by Streamlit on each interaction) ----
st.markdown("<h1 style='text-align: center;'>PogcastGPT</h1>", unsafe_allow_html=True)
st.write("""
This app uses semantic search to find and summarize relevant sections of the Pogcast to answer your question
""")
# Bootstrap CSS is injected so the result cards can use its grid classes.
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

query = st.text_input(label="Ask me a question about the Pogcast!", max_chars=200, value="", key="inquiryBox", type='default')
if query != "":
    runInquiry(query)
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pinecone-client
|
| 2 |
+
openai
|
| 3 |
+
streamlit
|