mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
[Multi-User Part 7]: Improve Sign-In UX & Rename DB Models for Readability (#528)
### ✨ New
- Create profile pic drop-down menu in navigation pane
  Put settings page, logout action under drop-down menu

### ⚙️ Fix
- Add Key icon for API keys table on Web Client's settings page

### 🧪 Improve
- Rename `TextEmbeddings` to `TextEntries` for improved readability
- Rename `Db.Models` `Embeddings`, `EmbeddingsAdapter` to `Entry`, `EntryAdapter`
- Show truncated API key for identification & restrict table width for config page responsiveness
This commit is contained in:
commit
0fb81189ca
30 changed files with 498 additions and 326 deletions
|
@ -27,7 +27,7 @@ from database.models import (
|
|||
KhojApiUser,
|
||||
NotionConfig,
|
||||
GithubConfig,
|
||||
Embeddings,
|
||||
Entry,
|
||||
GithubRepoConfig,
|
||||
Conversation,
|
||||
ConversationProcessorConfig,
|
||||
|
@ -286,54 +286,54 @@ class ConversationAdapters:
|
|||
return await OpenAIProcessorConversationConfig.objects.filter(user=user).afirst()
|
||||
|
||||
|
||||
class EmbeddingsAdapters:
|
||||
class EntryAdapters:
|
||||
word_filer = WordFilter()
|
||||
file_filter = FileFilter()
|
||||
date_filter = DateFilter()
|
||||
|
||||
@staticmethod
|
||||
def does_embedding_exist(user: KhojUser, hashed_value: str) -> bool:
|
||||
return Embeddings.objects.filter(user=user, hashed_value=hashed_value).exists()
|
||||
def does_entry_exist(user: KhojUser, hashed_value: str) -> bool:
|
||||
return Entry.objects.filter(user=user, hashed_value=hashed_value).exists()
|
||||
|
||||
@staticmethod
|
||||
def delete_embedding_by_file(user: KhojUser, file_path: str):
|
||||
deleted_count, _ = Embeddings.objects.filter(user=user, file_path=file_path).delete()
|
||||
def delete_entry_by_file(user: KhojUser, file_path: str):
|
||||
deleted_count, _ = Entry.objects.filter(user=user, file_path=file_path).delete()
|
||||
return deleted_count
|
||||
|
||||
@staticmethod
|
||||
def delete_all_embeddings(user: KhojUser, file_type: str):
|
||||
deleted_count, _ = Embeddings.objects.filter(user=user, file_type=file_type).delete()
|
||||
def delete_all_entries(user: KhojUser, file_type: str):
|
||||
deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete()
|
||||
return deleted_count
|
||||
|
||||
@staticmethod
|
||||
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
|
||||
return Embeddings.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
||||
return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
||||
|
||||
@staticmethod
|
||||
def delete_embedding_by_hash(user: KhojUser, hashed_values: List[str]):
|
||||
Embeddings.objects.filter(user=user, hashed_value__in=hashed_values).delete()
|
||||
def delete_entry_by_hash(user: KhojUser, hashed_values: List[str]):
|
||||
Entry.objects.filter(user=user, hashed_value__in=hashed_values).delete()
|
||||
|
||||
@staticmethod
|
||||
def get_embeddings_by_date_filter(embeddings: BaseManager[Embeddings], start_date: date, end_date: date):
|
||||
return embeddings.filter(
|
||||
embeddingsdates__date__gte=start_date,
|
||||
embeddingsdates__date__lte=end_date,
|
||||
def get_entries_by_date_filter(entry: BaseManager[Entry], start_date: date, end_date: date):
|
||||
return entry.filter(
|
||||
entrydates__date__gte=start_date,
|
||||
entrydates__date__lte=end_date,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
async def user_has_embeddings(user: KhojUser):
|
||||
return await Embeddings.objects.filter(user=user).aexists()
|
||||
async def user_has_entries(user: KhojUser):
|
||||
return await Entry.objects.filter(user=user).aexists()
|
||||
|
||||
@staticmethod
|
||||
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
|
||||
q_filter_terms = Q()
|
||||
|
||||
explicit_word_terms = EmbeddingsAdapters.word_filer.get_filter_terms(query)
|
||||
file_filters = EmbeddingsAdapters.file_filter.get_filter_terms(query)
|
||||
date_filters = EmbeddingsAdapters.date_filter.get_query_date_range(query)
|
||||
explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query)
|
||||
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
|
||||
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
|
||||
|
||||
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
|
||||
return Embeddings.objects.filter(user=user)
|
||||
return Entry.objects.filter(user=user)
|
||||
|
||||
for term in explicit_word_terms:
|
||||
if term.startswith("+"):
|
||||
|
@ -354,32 +354,32 @@ class EmbeddingsAdapters:
|
|||
if min_date is not None:
|
||||
# Convert the min_date timestamp to yyyy-mm-dd format
|
||||
formatted_min_date = date.fromtimestamp(min_date).strftime("%Y-%m-%d")
|
||||
q_filter_terms &= Q(embeddings_dates__date__gte=formatted_min_date)
|
||||
q_filter_terms &= Q(entry_dates__date__gte=formatted_min_date)
|
||||
if max_date is not None:
|
||||
# Convert the max_date timestamp to yyyy-mm-dd format
|
||||
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
|
||||
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
|
||||
q_filter_terms &= Q(entry_dates__date__lte=formatted_max_date)
|
||||
|
||||
relevant_embeddings = Embeddings.objects.filter(user=user).filter(
|
||||
relevant_entries = Entry.objects.filter(user=user).filter(
|
||||
q_filter_terms,
|
||||
)
|
||||
if file_type_filter:
|
||||
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
|
||||
return relevant_embeddings
|
||||
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||
return relevant_entries
|
||||
|
||||
@staticmethod
|
||||
def search_with_embeddings(
|
||||
user: KhojUser, embeddings: Tensor, max_results: int = 10, file_type_filter: str = None, raw_query: str = None
|
||||
):
|
||||
relevant_embeddings = EmbeddingsAdapters.apply_filters(user, raw_query, file_type_filter)
|
||||
relevant_embeddings = relevant_embeddings.filter(user=user).annotate(
|
||||
relevant_entries = EntryAdapters.apply_filters(user, raw_query, file_type_filter)
|
||||
relevant_entries = relevant_entries.filter(user=user).annotate(
|
||||
distance=CosineDistance("embeddings", embeddings)
|
||||
)
|
||||
if file_type_filter:
|
||||
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
|
||||
relevant_embeddings = relevant_embeddings.order_by("distance")
|
||||
return relevant_embeddings[:max_results]
|
||||
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||
relevant_entries = relevant_entries.order_by("distance")
|
||||
return relevant_entries[:max_results]
|
||||
|
||||
@staticmethod
|
||||
def get_unique_file_types(user: KhojUser):
|
||||
return Embeddings.objects.filter(user=user).values_list("file_type", flat=True).distinct()
|
||||
return Entry.objects.filter(user=user).values_list("file_type", flat=True).distinct()
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
# Generated by Django 4.2.5 on 2023-10-26 23:52
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("database", "0009_khojapiuser"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameModel(
|
||||
old_name="Embeddings",
|
||||
new_name="Entry",
|
||||
),
|
||||
migrations.RenameModel(
|
||||
old_name="EmbeddingsDates",
|
||||
new_name="EntryDates",
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name="entrydates",
|
||||
old_name="embeddings",
|
||||
new_name="entry",
|
||||
),
|
||||
migrations.RenameIndex(
|
||||
model_name="entrydates",
|
||||
new_name="database_en_date_8d823c_idx",
|
||||
old_name="database_em_date_a1ba47_idx",
|
||||
),
|
||||
]
|
|
@ -114,8 +114,8 @@ class Conversation(BaseModel):
|
|||
conversation_log = models.JSONField(default=dict)
|
||||
|
||||
|
||||
class Embeddings(BaseModel):
|
||||
class EmbeddingsType(models.TextChoices):
|
||||
class Entry(BaseModel):
|
||||
class EntryType(models.TextChoices):
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
PLAINTEXT = "plaintext"
|
||||
|
@ -130,7 +130,7 @@ class Embeddings(BaseModel):
|
|||
raw = models.TextField()
|
||||
compiled = models.TextField()
|
||||
heading = models.CharField(max_length=1000, default=None, null=True, blank=True)
|
||||
file_type = models.CharField(max_length=30, choices=EmbeddingsType.choices, default=EmbeddingsType.PLAINTEXT)
|
||||
file_type = models.CharField(max_length=30, choices=EntryType.choices, default=EntryType.PLAINTEXT)
|
||||
file_path = models.CharField(max_length=400, default=None, null=True, blank=True)
|
||||
file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
|
||||
url = models.URLField(max_length=400, default=None, null=True, blank=True)
|
||||
|
@ -138,9 +138,9 @@ class Embeddings(BaseModel):
|
|||
corpus_id = models.UUIDField(default=uuid.uuid4, editable=False)
|
||||
|
||||
|
||||
class EmbeddingsDates(BaseModel):
|
||||
class EntryDates(BaseModel):
|
||||
date = models.DateField()
|
||||
embeddings = models.ForeignKey(Embeddings, on_delete=models.CASCADE, related_name="embeddings_dates")
|
||||
entry = models.ForeignKey(Entry, on_delete=models.CASCADE, related_name="embeddings_dates")
|
||||
|
||||
class Meta:
|
||||
indexes = [
|
||||
|
|
4
src/khoj/interface/web/assets/icons/key.svg
Normal file
4
src/khoj/interface/web/assets/icons/key.svg
Normal file
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<svg width="800px" height="800px" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M22 8.29344C22 11.7692 19.1708 14.5869 15.6807 14.5869C15.0439 14.5869 13.5939 14.4405 12.8885 13.8551L12.0067 14.7333C11.4883 15.2496 11.6283 15.4016 11.8589 15.652C11.9551 15.7565 12.0672 15.8781 12.1537 16.0505C12.1537 16.0505 12.8885 17.075 12.1537 18.0995C11.7128 18.6849 10.4783 19.5045 9.06754 18.0995L8.77362 18.3922C8.77362 18.3922 9.65538 19.4167 8.92058 20.4412C8.4797 21.0267 7.30403 21.6121 6.27531 20.5876L5.2466 21.6121C4.54119 22.3146 3.67905 21.9048 3.33616 21.6121L2.45441 20.7339C1.63143 19.9143 2.1115 19.0264 2.45441 18.6849L10.0963 11.0743C10.0963 11.0743 9.3615 9.90338 9.3615 8.29344C9.3615 4.81767 12.1907 2 15.6807 2C19.1708 2 22 4.81767 22 8.29344ZM15.681 10.4889C16.8984 10.4889 17.8853 9.50601 17.8853 8.29353C17.8853 7.08105 16.8984 6.09814 15.681 6.09814C14.4635 6.09814 13.4766 7.08105 13.4766 8.29353C13.4766 9.50601 14.4635 10.4889 15.681 10.4889Z" fill="#1C274C"/>
|
||||
</svg>
|
After Width: | Height: | Size: 1.1 KiB |
|
@ -6,6 +6,8 @@
|
|||
--primary-hover: #ffa000;
|
||||
--primary-focus: rgba(255, 179, 0, 0.125);
|
||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||
--background-color: #fff;
|
||||
--main-text-color: #475569;
|
||||
}
|
||||
|
||||
/* Amber Dark scheme (Auto) */
|
||||
|
@ -16,6 +18,8 @@
|
|||
--primary-hover: #ffc107;
|
||||
--primary-focus: rgba(255, 179, 0, 0.25);
|
||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||
--background-color: #fff;
|
||||
--main-text-color: #475569;
|
||||
}
|
||||
}
|
||||
/* Amber Dark scheme (Forced) */
|
||||
|
@ -25,6 +29,8 @@
|
|||
--primary-hover: #ffc107;
|
||||
--primary-focus: rgba(255, 179, 0, 0.25);
|
||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||
--background-color: #fff;
|
||||
--main-text-color: #475569;
|
||||
}
|
||||
/* Amber (Common styles) */
|
||||
:root {
|
||||
|
@ -37,7 +43,8 @@
|
|||
.khoj-configure {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
padding: 0 24px;
|
||||
font-family: roboto, karma, segoe ui, sans-serif;
|
||||
font-weight: 300;
|
||||
}
|
||||
.khoj-header {
|
||||
display: grid;
|
||||
|
@ -100,7 +107,84 @@ p#khoj-banner {
|
|||
display: inline;
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 600px) {
|
||||
/* Dropdown in navigation menu*/
|
||||
#khoj-nav-menu-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
}
|
||||
.khoj-nav-dropdown-content {
|
||||
display: block;
|
||||
grid-auto-flow: row;
|
||||
position: absolute;
|
||||
background-color: var(--background-color);
|
||||
min-width: 160px;
|
||||
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||
right: 15vw;
|
||||
top: 64px;
|
||||
z-index: 1;
|
||||
opacity: 0;
|
||||
transition: opacity 0.1s ease-in-out;
|
||||
pointer-events: none;
|
||||
text-align: left;
|
||||
}
|
||||
.khoj-nav-dropdown-content.show {
|
||||
opacity: 1;
|
||||
pointer-events: auto;
|
||||
}
|
||||
.khoj-nav-dropdown-content a {
|
||||
color: black;
|
||||
padding: 12px 16px;
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
}
|
||||
.khoj-nav-dropdown-content a:hover {
|
||||
background-color: var(--primary-hover);
|
||||
}
|
||||
.khoj-nav-username {
|
||||
padding: 12px 16px;
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
}
|
||||
.circle {
|
||||
border-radius: 50%;
|
||||
border: 2px solid var(--primary-inverse);
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
vertical-align: text-top;
|
||||
padding: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
.circle:hover {
|
||||
background-color: var(--primary-hover);
|
||||
}
|
||||
.user-initial {
|
||||
background-color: white;
|
||||
color: black;
|
||||
display: grid;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
font-size: 20px;
|
||||
box-sizing: unset;
|
||||
}
|
||||
|
||||
@media screen and (max-width: 700px) {
|
||||
.khoj-nav-dropdown-content {
|
||||
display: block;
|
||||
grid-auto-flow: row;
|
||||
position: absolute;
|
||||
background-color: var(--background-color);
|
||||
min-width: 160px;
|
||||
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||
right: 10px;
|
||||
z-index: 1;
|
||||
opacity: 0;
|
||||
transition: opacity 0.1s ease-in-out;
|
||||
pointer-events: none;
|
||||
}
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 700px) {
|
||||
div.khoj-header {
|
||||
display: grid;
|
||||
grid-auto-flow: column;
|
||||
|
|
15
src/khoj/interface/web/assets/khoj.js
Normal file
15
src/khoj/interface/web/assets/khoj.js
Normal file
|
@ -0,0 +1,15 @@
|
|||
// Toggle the navigation menu
|
||||
function toggleMenu() {
|
||||
var menu = document.getElementById("khoj-nav-menu");
|
||||
menu.classList.toggle("show");
|
||||
}
|
||||
|
||||
// Close the dropdown menu if the user clicks outside of it
|
||||
document.addEventListener('click', function(event) {
|
||||
let menu = document.getElementById("khoj-nav-menu");
|
||||
let menuContainer = document.getElementById("khoj-nav-menu-container");
|
||||
let isClickOnMenu = menuContainer.contains(event.target) || menuContainer === event.target;
|
||||
if (isClickOnMenu === false && menu.classList.contains("show")) {
|
||||
menu.classList.remove("show");
|
||||
}
|
||||
});
|
|
@ -8,19 +8,15 @@
|
|||
<link rel="stylesheet" href="/static/assets/pico.min.css">
|
||||
<link rel="stylesheet" href="/static/assets/khoj.css">
|
||||
</head>
|
||||
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||
<body class="khoj-configure">
|
||||
<div class="khoj-header-wrapper">
|
||||
<div class="filler"></div>
|
||||
<div class="khoj-header">
|
||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
<nav class="khoj-nav">
|
||||
<a class="khoj-nav" href="/chat">Chat</a>
|
||||
<a class="khoj-nav" href="/">Search</a>
|
||||
<a class="khoj-nav khoj-nav-selected" href="/config">Settings</a>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
<!--Add Header Logo and Nav Pane-->
|
||||
{% import 'utils.html' as utils %}
|
||||
{{ utils.heading_pane(user_photo, username) }}
|
||||
|
||||
<div class="filler"></div>
|
||||
</div>
|
||||
<div class=”content”>
|
||||
|
@ -38,10 +34,15 @@
|
|||
img.khoj-logo {
|
||||
max-width: none!important;
|
||||
}
|
||||
div.khoj-header-wrapper{
|
||||
div.khoj-header-wrapper {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||
}
|
||||
img.circle {
|
||||
width: 49px;
|
||||
height: 49px;
|
||||
}
|
||||
|
||||
.page {
|
||||
display: grid;
|
||||
grid-auto-flow: row;
|
||||
|
@ -233,12 +234,12 @@
|
|||
height: 32px;
|
||||
}
|
||||
|
||||
@media screen and (max-width: 600px) {
|
||||
@media screen and (max-width: 700px) {
|
||||
.section-cards {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
|
@ -264,10 +265,9 @@
|
|||
width: 320px;
|
||||
}
|
||||
|
||||
div.khoj-header-wrapper{
|
||||
div.khoj-header-wrapper {
|
||||
grid-template-columns: auto;
|
||||
}
|
||||
|
||||
}
|
||||
</style>
|
||||
</html>
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
<link rel="manifest" href="/static/khoj_chat.webmanifest">
|
||||
<link rel="stylesheet" href="/static/assets/khoj.css">
|
||||
</head>
|
||||
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||
<script>
|
||||
let chatOptions = [];
|
||||
function copyProgrammaticOutput(event) {
|
||||
|
@ -269,25 +270,10 @@
|
|||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!--Add Header Logo and Nav Pane-->
|
||||
<div class="khoj-header">
|
||||
{% if demo %}
|
||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
{% else %}
|
||||
<a class="khoj-logo" href="/">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
{% endif %}
|
||||
<nav class="khoj-nav">
|
||||
<a class="khoj-nav khoj-nav-selected" href="/chat">Chat</a>
|
||||
<a class="khoj-nav" href="/">Search</a>
|
||||
{% if not demo %}
|
||||
<a class="khoj-nav" href="/config">Settings</a>
|
||||
{% endif %}
|
||||
</nav>
|
||||
</div>
|
||||
{% import 'utils.html' as utils %}
|
||||
{{ utils.heading_pane(user_photo, username) }}
|
||||
|
||||
<!-- Chat Body -->
|
||||
<div id="chat-body"></div>
|
||||
|
@ -309,8 +295,8 @@
|
|||
}
|
||||
body {
|
||||
display: grid;
|
||||
background: #fff;
|
||||
color: #475569;
|
||||
background: var(--background-color);
|
||||
color: var(--main-text-color);
|
||||
text-align: center;
|
||||
font-family: roboto, karma, segoe ui, sans-serif;
|
||||
font-size: 20px;
|
||||
|
@ -332,7 +318,7 @@
|
|||
content: attr(data-meta);
|
||||
display: block;
|
||||
font-size: x-small;
|
||||
color: #475569;
|
||||
color: var(--main-text-color);
|
||||
margin: -8px 4px 0 -5px;
|
||||
}
|
||||
/* move message by khoj to left */
|
||||
|
@ -402,7 +388,7 @@
|
|||
top: 91%;
|
||||
right: -2px;
|
||||
border: 10px solid transparent;
|
||||
border-left-color: #475569;
|
||||
border-left-color: var(--main-text-color);
|
||||
border-right: 0;
|
||||
margin-top: -10px;
|
||||
transform: rotate(-60deg)
|
||||
|
@ -418,7 +404,7 @@
|
|||
#chat-footer > * {
|
||||
padding: 15px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
background: #f9fafc
|
||||
}
|
||||
.option:hover {
|
||||
|
@ -451,9 +437,9 @@
|
|||
}
|
||||
|
||||
a.inline-chat-link {
|
||||
color: #475569;
|
||||
color: var(--main-text-color);
|
||||
text-decoration: none;
|
||||
border-bottom: 1px dotted #475569;
|
||||
border-bottom: 1px dotted var(--main-text-color);
|
||||
}
|
||||
|
||||
@media (pointer: coarse), (hover: none) {
|
||||
|
@ -479,7 +465,7 @@
|
|||
padding: 2px 4px;
|
||||
}
|
||||
}
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
body {
|
||||
grid-template-columns: 1fr;
|
||||
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
||||
|
@ -499,7 +485,7 @@
|
|||
padding: 0;
|
||||
}
|
||||
}
|
||||
@media only screen and (min-width: 600px) {
|
||||
@media only screen and (min-width: 700px) {
|
||||
body {
|
||||
grid-template-columns: auto min(70vw, 100%) auto;
|
||||
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
||||
|
@ -542,7 +528,7 @@
|
|||
input#khoj-banner-email {
|
||||
padding: 10px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
background: #f9fafc;
|
||||
}
|
||||
|
||||
|
|
|
@ -3,11 +3,6 @@
|
|||
|
||||
<div class="page">
|
||||
<div class="section">
|
||||
{% if anonymous_mode == False %}
|
||||
<div>
|
||||
Logged in as {{ username }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<h2 class="section-title">Plugins</h2>
|
||||
<div class="section-cards">
|
||||
<div class="card">
|
||||
|
@ -328,11 +323,6 @@
|
|||
<div class="finalize-buttons">
|
||||
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
||||
</div>
|
||||
{% if anonymous_mode == False %}
|
||||
<div class="finalize-buttons">
|
||||
<button id="logout" class="logout" onclick="window.location.href='/auth/logout'">Logout</button>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -541,16 +531,7 @@
|
|||
})
|
||||
.then(response => response.json())
|
||||
.then(tokenObj => {
|
||||
apiKeyList.innerHTML += `
|
||||
<tr id="api-key-item-${tokenObj.token}">
|
||||
<td><b>${tokenObj.name}</b></td>
|
||||
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
|
||||
<td>
|
||||
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
|
||||
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
apiKeyList.innerHTML += generateTokenRow(tokenObj);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -561,7 +542,7 @@
|
|||
const copyApiKeyButton = document.getElementById(`api-key-${token}`);
|
||||
original_html = copyApiKeyButton.innerHTML
|
||||
setTimeout(function() {
|
||||
copyApiKeyButton.innerHTML = "✅ Copied to your clipboard!";
|
||||
copyApiKeyButton.innerHTML = "✅ Copied!";
|
||||
setTimeout(function() {
|
||||
copyApiKeyButton.innerHTML = original_html;
|
||||
}, 1000);
|
||||
|
@ -581,23 +562,30 @@
|
|||
});
|
||||
}
|
||||
|
||||
function generateTokenRow(tokenObj) {
|
||||
let token = tokenObj.token;
|
||||
let tokenName = tokenObj.name;
|
||||
let truncatedToken = token.slice(0, 4) + "..." + token.slice(-4);
|
||||
let tokenId = `${tokenName}-${truncatedToken}`;
|
||||
return `
|
||||
<tr id="api-key-item-${token}">
|
||||
<td><b>${tokenName}</b></td>
|
||||
<td id="api-key-${token}">${truncatedToken}</td>
|
||||
<td>
|
||||
<img onclick="copyAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key" title="Copy API Key">
|
||||
<img onclick="deleteAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key" title="Delete API Key">
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
|
||||
}
|
||||
|
||||
function listApiKeys() {
|
||||
const apiKeyList = document.getElementById("api-key-list");
|
||||
fetch('/auth/token')
|
||||
.then(response => response.json())
|
||||
.then(tokens => {
|
||||
apiKeyList.innerHTML = tokens.map(tokenObj =>
|
||||
`
|
||||
<tr id="api-key-item-${tokenObj.token}">
|
||||
<td><b>${tokenObj.name}</b></td>
|
||||
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
|
||||
<td>
|
||||
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
|
||||
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
|
||||
</td>
|
||||
</tr>
|
||||
`)
|
||||
.join("");
|
||||
apiKeyList.innerHTML = tokens.map(generateTokenRow).join("");
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
</head>
|
||||
<script type="text/javascript" src="/static/assets/org.min.js"></script>
|
||||
<script type="text/javascript" src="/static/assets/markdown-it.min.js"></script>
|
||||
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||
|
||||
<script>
|
||||
function render_image(item) {
|
||||
|
@ -281,25 +282,10 @@
|
|||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!--Add Header Logo and Nav Pane-->
|
||||
<div class="khoj-header">
|
||||
{% if demo %}
|
||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
{% else %}
|
||||
<a class="khoj-logo" href="/">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
{% endif %}
|
||||
<nav class="khoj-nav">
|
||||
<a class="khoj-nav" href="/chat">Chat</a>
|
||||
<a class="khoj-nav khoj-nav-selected" href="/">Search</a>
|
||||
{% if not demo %}
|
||||
<a class="khoj-nav" href="/config">Settings</a>
|
||||
{% endif %}
|
||||
</nav>
|
||||
</div>
|
||||
{% import 'utils.html' as utils %}
|
||||
{{ utils.heading_pane(user_photo, username) }}
|
||||
|
||||
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
||||
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search your knowledge base using natural language">
|
||||
|
@ -314,7 +300,7 @@
|
|||
</body>
|
||||
|
||||
<style>
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
|
@ -325,7 +311,7 @@
|
|||
grid-column: 1;
|
||||
}
|
||||
}
|
||||
@media only screen and (min-width: 600px) {
|
||||
@media only screen and (min-width: 700px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||
|
@ -339,8 +325,8 @@
|
|||
body {
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
background: #fff;
|
||||
color: #475569;
|
||||
background: var(--background-color);
|
||||
color: var(--main-text-color);
|
||||
font-family: roboto, karma, segoe ui, sans-serif;
|
||||
font-size: 20px;
|
||||
font-weight: 300;
|
||||
|
@ -358,7 +344,7 @@
|
|||
#options > * {
|
||||
padding: 15px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
background: #f9fafc
|
||||
}
|
||||
.option:hover {
|
||||
|
@ -386,7 +372,7 @@
|
|||
.image {
|
||||
width: 20vw;
|
||||
border-radius: 10px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
}
|
||||
#json {
|
||||
white-space: pre-wrap;
|
||||
|
@ -429,7 +415,7 @@
|
|||
padding: 3.5px 3.5px 0;
|
||||
margin-right: 5px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
background-color: #ef4444;
|
||||
font-size: small;
|
||||
}
|
||||
|
@ -500,7 +486,7 @@
|
|||
input#khoj-banner-email {
|
||||
padding: 10px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #475569;
|
||||
border: 1px solid var(--main-text-color);
|
||||
background: #f9fafc;
|
||||
}
|
||||
|
||||
|
@ -509,7 +495,7 @@
|
|||
box-shadow: 0 0 11px #aaa;
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
a.khoj-banner {
|
||||
display: block;
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@
|
|||
</body>
|
||||
|
||||
<style>
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
|
@ -69,7 +69,7 @@
|
|||
grid-column: 1;
|
||||
}
|
||||
}
|
||||
@media only screen and (min-width: 600px) {
|
||||
@media only screen and (min-width: 700px) {
|
||||
body {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||
|
@ -150,7 +150,7 @@
|
|||
font-size: x-large;
|
||||
}
|
||||
|
||||
@media only screen and (max-width: 600px) {
|
||||
@media only screen and (max-width: 700px) {
|
||||
a.khoj-banner {
|
||||
display: block;
|
||||
}
|
||||
|
|
24
src/khoj/interface/web/utils.html
Normal file
24
src/khoj/interface/web/utils.html
Normal file
|
@ -0,0 +1,24 @@
|
|||
{% macro heading_pane(user_photo, username) -%}
|
||||
<div class="khoj-header">
|
||||
<a class="khoj-logo" href="/" target="_blank">
|
||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||
</a>
|
||||
<nav class="khoj-nav">
|
||||
<a class="khoj-nav" href="/chat">💬 Chat</a>
|
||||
<a class="khoj-nav" href="/">🔎 Search</a>
|
||||
<!-- Dropdown Menu -->
|
||||
<div id="khoj-nav-menu-container" class="khoj-nav dropdown">
|
||||
{% if user_photo and user_photo != "None" %}
|
||||
<img class="circle" src="{{ user_photo }}" alt="{{ username[0].upper() }}" onclick="toggleMenu()" referrerpolicy="no-referrer">
|
||||
{% else %}
|
||||
<div class="circle user-initial" alt="{{ username[0].upper() }}" onclick="toggleMenu()">{{ username[0].upper() }}</div>
|
||||
{% endif %}
|
||||
<div id="khoj-nav-menu" class="khoj-nav-dropdown-content">
|
||||
<div class="khoj-nav-username"> {{ username }} </div>
|
||||
<a class="khoj-nav khoj-nav-selected" href="/config">⚙️ Settings</a>
|
||||
<a class="khoj-nav" href="/auth/logout">🔑 Logout</a>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
</div>
|
||||
{%- endmacro %}
|
|
@ -10,17 +10,16 @@ import requests
|
|||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Embeddings, GithubConfig, KhojUser
|
||||
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from database.models import Entry as DbEntry, GithubConfig, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GithubToJsonl(TextEmbeddings):
|
||||
class GithubToEntries(TextToEntries):
|
||||
def __init__(self, config: GithubConfig):
|
||||
super().__init__(config)
|
||||
raw_repos = config.githubrepoconfig.all()
|
||||
|
@ -78,24 +77,26 @@ class GithubToJsonl(TextEmbeddings):
|
|||
current_entries = []
|
||||
|
||||
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||
*GithubToJsonl.extract_markdown_entries(markdown_files)
|
||||
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||
*GithubToEntries.extract_markdown_entries(markdown_files)
|
||||
)
|
||||
|
||||
with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
|
||||
current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
|
||||
current_entries += OrgToEntries.convert_org_nodes_to_entries(
|
||||
*GithubToEntries.extract_org_entries(org_files)
|
||||
)
|
||||
|
||||
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
||||
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
||||
|
||||
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
||||
issue_entries = GithubToJsonl.convert_issues_to_entries(
|
||||
*GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
|
||||
issue_entries = GithubToEntries.convert_issues_to_entries(
|
||||
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
|
||||
)
|
||||
current_entries += issue_entries
|
||||
|
||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||
current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
|
||||
return current_entries
|
||||
|
||||
|
@ -103,7 +104,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries, Embeddings.EmbeddingsType.GITHUB, key="compiled", logger=logger, user=user
|
||||
current_entries, DbEntry.EntryType.GITHUB, key="compiled", logger=logger, user=user
|
||||
)
|
||||
|
||||
return num_new_embeddings, num_deleted_embeddings
|
||||
|
@ -281,7 +282,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||
entries = []
|
||||
entry_to_file_map = []
|
||||
for doc in markdown_files:
|
||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
||||
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
@ -292,7 +293,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||
entry_to_file_map = []
|
||||
|
||||
for doc in org_files:
|
||||
entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
|
||||
entries, entry_to_file_map = OrgToEntries.process_single_org_file(
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
|
@ -6,17 +6,17 @@ from pathlib import Path
|
|||
from typing import Tuple, List
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Embeddings, KhojUser
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownToJsonl(TextEmbeddings):
|
||||
class MarkdownToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||
|
||||
# Extract Entries from specified Markdown files
|
||||
with timer("Parse entries from Markdown files into dictionaries", logger):
|
||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||
*MarkdownToJsonl.extract_markdown_entries(files)
|
||||
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||
*MarkdownToEntries.extract_markdown_entries(files)
|
||||
)
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
|
@ -46,7 +46,7 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries,
|
||||
Embeddings.EmbeddingsType.MARKDOWN,
|
||||
DbEntry.EntryType.MARKDOWN,
|
||||
"compiled",
|
||||
logger,
|
||||
deletion_file_names,
|
||||
|
@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||
for markdown_file in markdown_files:
|
||||
try:
|
||||
markdown_content = markdown_files[markdown_file]
|
||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
||||
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||
markdown_content, markdown_file, entries, entry_to_file_map
|
||||
)
|
||||
except Exception as e:
|
|
@ -8,9 +8,9 @@ import requests
|
|||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Embeddings, KhojUser, NotionConfig
|
||||
from database.models import Entry as DbEntry, KhojUser, NotionConfig
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
@ -50,7 +50,7 @@ class NotionBlockType(Enum):
|
|||
CALLOUT = "callout"
|
||||
|
||||
|
||||
class NotionToJsonl(TextEmbeddings):
|
||||
class NotionToEntries(TextToEntries):
|
||||
def __init__(self, config: NotionConfig):
|
||||
super().__init__(config)
|
||||
self.config = NotionContentConfig(
|
||||
|
@ -250,7 +250,7 @@ class NotionToJsonl(TextEmbeddings):
|
|||
# Identify, mark and merge any new entries with previous entries
|
||||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries, Embeddings.EmbeddingsType.NOTION, key="compiled", logger=logger, user=user
|
||||
current_entries, DbEntry.EntryType.NOTION, key="compiled", logger=logger, user=user
|
||||
)
|
||||
|
||||
return num_new_embeddings, num_deleted_embeddings
|
|
@ -5,17 +5,17 @@ from typing import Iterable, List, Tuple
|
|||
|
||||
# Internal Packages
|
||||
from khoj.processor.org_mode import orgnode
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils import state
|
||||
from database.models import Embeddings, KhojUser
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OrgToJsonl(TextEmbeddings):
|
||||
class OrgToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
@ -47,7 +47,7 @@ class OrgToJsonl(TextEmbeddings):
|
|||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries,
|
||||
Embeddings.EmbeddingsType.ORG,
|
||||
DbEntry.EntryType.ORG,
|
||||
"compiled",
|
||||
logger,
|
||||
deletion_file_names,
|
|
@ -8,16 +8,16 @@ import base64
|
|||
from langchain.document_loaders import PyMuPDFLoader
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Embeddings, KhojUser
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfToJsonl(TextEmbeddings):
|
||||
class PdfToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
@ -35,7 +35,7 @@ class PdfToJsonl(TextEmbeddings):
|
|||
|
||||
# Extract Entries from specified Pdf files
|
||||
with timer("Parse entries from PDF files into dictionaries", logger):
|
||||
current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
|
||||
current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
with timer("Split entries by max token size supported by model", logger):
|
||||
|
@ -45,7 +45,7 @@ class PdfToJsonl(TextEmbeddings):
|
|||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries,
|
||||
Embeddings.EmbeddingsType.PDF,
|
||||
DbEntry.EntryType.PDF,
|
||||
"compiled",
|
||||
logger,
|
||||
deletion_file_names,
|
|
@ -6,16 +6,16 @@ from bs4 import BeautifulSoup
|
|||
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Embeddings, KhojUser
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PlaintextToJsonl(TextEmbeddings):
|
||||
class PlaintextToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||
try:
|
||||
plaintext_content = files[file]
|
||||
if file.endswith(("html", "htm", "xml")):
|
||||
plaintext_content = PlaintextToJsonl.extract_html_content(
|
||||
plaintext_content = PlaintextToEntries.extract_html_content(
|
||||
plaintext_content, file.split(".")[-1]
|
||||
)
|
||||
files[file] = plaintext_content
|
||||
|
@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||
|
||||
# Extract Entries from specified plaintext files
|
||||
with timer("Parse entries from plaintext files", logger):
|
||||
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
|
||||
current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
with timer("Split entries by max token size supported by model", logger):
|
||||
|
@ -55,7 +55,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||
with timer("Identify new or updated entries", logger):
|
||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||
current_entries,
|
||||
Embeddings.EmbeddingsType.PLAINTEXT,
|
||||
DbEntry.EntryType.PLAINTEXT,
|
||||
key="compiled",
|
||||
logger=logger,
|
||||
deletion_filenames=deletion_file_names,
|
|
@ -12,14 +12,14 @@ from khoj.utils.helpers import timer, batcher
|
|||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.processor.embeddings import EmbeddingsModel
|
||||
from khoj.search_filter.date_filter import DateFilter
|
||||
from database.models import KhojUser, Embeddings, EmbeddingsDates
|
||||
from database.adapters import EmbeddingsAdapters
|
||||
from database.models import KhojUser, Entry as DbEntry, EntryDates
|
||||
from database.adapters import EntryAdapters
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextEmbeddings(ABC):
|
||||
class TextToEntries(ABC):
|
||||
def __init__(self, config: Any = None):
|
||||
self.embeddings_model = EmbeddingsModel()
|
||||
self.config = config
|
||||
|
@ -85,23 +85,23 @@ class TextEmbeddings(ABC):
|
|||
):
|
||||
with timer("Construct current entry hashes", logger):
|
||||
hashes_by_file = dict[str, set[str]]()
|
||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
||||
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||
for entry in tqdm(current_entries, desc="Hashing Entries"):
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry))
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
|
||||
|
||||
num_deleted_embeddings = 0
|
||||
with timer("Preparing dataset for regeneration", logger):
|
||||
if regenerate:
|
||||
logger.debug(f"Deleting all embeddings for file type {file_type}")
|
||||
num_deleted_embeddings = EmbeddingsAdapters.delete_all_embeddings(user, file_type)
|
||||
num_deleted_embeddings = EntryAdapters.delete_all_entries(user, file_type)
|
||||
|
||||
num_new_embeddings = 0
|
||||
with timer("Identify hashes for adding new entries", logger):
|
||||
for file in tqdm(hashes_by_file, desc="Processing file with hashed values"):
|
||||
hashes_for_file = hashes_by_file[file]
|
||||
hashes_to_process = set()
|
||||
existing_entries = Embeddings.objects.filter(
|
||||
existing_entries = DbEntry.objects.filter(
|
||||
user=user, hashed_value__in=hashes_for_file, file_type=file_type
|
||||
)
|
||||
existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
|
||||
|
@ -124,7 +124,7 @@ class TextEmbeddings(ABC):
|
|||
for entry_hash, embedding in entry_batch:
|
||||
entry = hash_to_current_entries[entry_hash]
|
||||
batch_embeddings_to_create.append(
|
||||
Embeddings(
|
||||
DbEntry(
|
||||
user=user,
|
||||
embeddings=embedding,
|
||||
raw=entry.raw,
|
||||
|
@ -136,7 +136,7 @@ class TextEmbeddings(ABC):
|
|||
corpus_id=entry.corpus_id,
|
||||
)
|
||||
)
|
||||
new_embeddings = Embeddings.objects.bulk_create(batch_embeddings_to_create)
|
||||
new_embeddings = DbEntry.objects.bulk_create(batch_embeddings_to_create)
|
||||
logger.debug(f"Created {len(new_embeddings)} new embeddings")
|
||||
num_new_embeddings += len(new_embeddings)
|
||||
|
||||
|
@ -146,26 +146,26 @@ class TextEmbeddings(ABC):
|
|||
dates = self.date_filter.extract_dates(embedding.raw)
|
||||
for date in dates:
|
||||
dates_to_create.append(
|
||||
EmbeddingsDates(
|
||||
EntryDates(
|
||||
date=date,
|
||||
embeddings=embedding,
|
||||
)
|
||||
)
|
||||
new_dates = EmbeddingsDates.objects.bulk_create(dates_to_create)
|
||||
new_dates = EntryDates.objects.bulk_create(dates_to_create)
|
||||
if len(new_dates) > 0:
|
||||
logger.debug(f"Created {len(new_dates)} new date entries")
|
||||
|
||||
with timer("Identify hashes for removed entries", logger):
|
||||
for file in hashes_by_file:
|
||||
existing_entry_hashes = EmbeddingsAdapters.get_existing_entry_hashes_by_file(user, file)
|
||||
existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file)
|
||||
to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
|
||||
num_deleted_embeddings += len(to_delete_entry_hashes)
|
||||
EmbeddingsAdapters.delete_embedding_by_hash(user, hashed_values=list(to_delete_entry_hashes))
|
||||
EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes))
|
||||
|
||||
with timer("Identify hashes for deleting entries", logger):
|
||||
if deletion_filenames is not None:
|
||||
for file_path in deletion_filenames:
|
||||
deleted_count = EmbeddingsAdapters.delete_embedding_by_file(user, file_path)
|
||||
deleted_count = EntryAdapters.delete_entry_by_file(user, file_path)
|
||||
num_deleted_embeddings += deleted_count
|
||||
|
||||
return num_new_embeddings, num_deleted_embeddings
|
||||
|
@ -180,11 +180,11 @@ class TextEmbeddings(ABC):
|
|||
):
|
||||
# Hash all current and previous entries to identify new entries
|
||||
with timer("Hash previous, current entries", logger):
|
||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries))
|
||||
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
|
||||
if deletion_filenames is not None:
|
||||
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
|
||||
deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries))
|
||||
deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
|
||||
else:
|
||||
deletion_entry_hashes = []
|
||||
|
|
@ -48,7 +48,7 @@ from khoj.processor.conversation.gpt4all.chat_model import extract_questions_off
|
|||
from fastapi.requests import Request
|
||||
|
||||
from database import adapters
|
||||
from database.adapters import EmbeddingsAdapters, ConversationAdapters
|
||||
from database.adapters import EntryAdapters, ConversationAdapters
|
||||
from database.models import LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig, KhojUser
|
||||
|
||||
|
||||
|
@ -129,7 +129,7 @@ if not state.demo:
|
|||
@requires(["authenticated"])
|
||||
def get_config_data(request: Request):
|
||||
user = request.user.object
|
||||
EmbeddingsAdapters.get_unique_file_types(user)
|
||||
EntryAdapters.get_unique_file_types(user)
|
||||
|
||||
return state.config
|
||||
|
||||
|
@ -145,7 +145,7 @@ if not state.demo:
|
|||
|
||||
configuration_update_metadata = {}
|
||||
|
||||
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
|
||||
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
||||
|
||||
if state.config.content_type is not None:
|
||||
configuration_update_metadata["github"] = "github" in enabled_content
|
||||
|
@ -241,9 +241,9 @@ if not state.demo:
|
|||
raise ValueError(f"Invalid content type: {content_type}")
|
||||
|
||||
await content_object.objects.filter(user=user).adelete()
|
||||
await sync_to_async(EmbeddingsAdapters.delete_all_embeddings)(user, content_type)
|
||||
await sync_to_async(EntryAdapters.delete_all_entries)(user, content_type)
|
||||
|
||||
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
|
||||
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
||||
return {"status": "ok"}
|
||||
|
||||
@api.post("/delete/config/data/processor/conversation/openai", status_code=200)
|
||||
|
@ -372,7 +372,7 @@ def get_config_types(
|
|||
):
|
||||
user = request.user.object
|
||||
|
||||
enabled_file_types = EmbeddingsAdapters.get_unique_file_types(user)
|
||||
enabled_file_types = EntryAdapters.get_unique_file_types(user)
|
||||
|
||||
configured_content_types = list(enabled_file_types)
|
||||
|
||||
|
@ -706,7 +706,7 @@ async def extract_references_and_questions(
|
|||
if conversation_type == ConversationCommand.General:
|
||||
return compiled_references, inferred_queries, q
|
||||
|
||||
if not await EmbeddingsAdapters.user_has_embeddings(user=user):
|
||||
if not await EntryAdapters.user_has_entries(user=user):
|
||||
logger.warning(
|
||||
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
||||
)
|
||||
|
|
|
@ -10,12 +10,12 @@ from starlette.authentication import requires
|
|||
|
||||
# Internal Packages
|
||||
from khoj.utils import state, constants
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
|
||||
from khoj.processor.github.github_to_entries import GithubToEntries
|
||||
from khoj.processor.notion.notion_to_entries import NotionToEntries
|
||||
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.routers.helpers import update_telemetry_state
|
||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||
|
@ -201,7 +201,7 @@ def configure_content(
|
|||
logger.info("🦄 Setting up search for orgmode notes")
|
||||
# Extract Entries, Generate Notes Embeddings
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
files.get("org"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
@ -216,7 +216,7 @@ def configure_content(
|
|||
logger.info("💎 Setting up search for markdown notes")
|
||||
# Extract Entries, Generate Markdown Embeddings
|
||||
text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
MarkdownToEntries,
|
||||
files.get("markdown"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
@ -232,7 +232,7 @@ def configure_content(
|
|||
logger.info("🖨️ Setting up search for pdf")
|
||||
# Extract Entries, Generate PDF Embeddings
|
||||
text_search.setup(
|
||||
PdfToJsonl,
|
||||
PdfToEntries,
|
||||
files.get("pdf"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
@ -248,7 +248,7 @@ def configure_content(
|
|||
logger.info("📄 Setting up search for plaintext")
|
||||
# Extract Entries, Generate Plaintext Embeddings
|
||||
text_search.setup(
|
||||
PlaintextToJsonl,
|
||||
PlaintextToEntries,
|
||||
files.get("plaintext"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
@ -281,7 +281,7 @@ def configure_content(
|
|||
logger.info("🐙 Setting up search for github")
|
||||
# Extract Entries, Generate Github Embeddings
|
||||
text_search.setup(
|
||||
GithubToJsonl,
|
||||
GithubToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
@ -298,7 +298,7 @@ def configure_content(
|
|||
if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
|
||||
logger.info("🔌 Setting up search for notion")
|
||||
text_search.setup(
|
||||
NotionToJsonl,
|
||||
NotionToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
|
|
@ -19,7 +19,7 @@ from khoj.utils.rawconfig import (
|
|||
|
||||
# Internal Packages
|
||||
from khoj.utils import constants, state
|
||||
from database.adapters import EmbeddingsAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
|
||||
from database.adapters import EntryAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
|
||||
from database.models import LocalOrgConfig, LocalMarkdownConfig, LocalPdfConfig, LocalPlaintextConfig
|
||||
|
||||
|
||||
|
@ -34,19 +34,52 @@ VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"]
|
|||
@web_client.get("/", response_class=FileResponse)
|
||||
@requires(["authenticated"], redirect="login_page")
|
||||
def index(request: Request):
|
||||
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"index.html",
|
||||
context={
|
||||
"request": request,
|
||||
"demo": state.demo,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@web_client.post("/", response_class=FileResponse)
|
||||
@requires(["authenticated"], redirect="login_page")
|
||||
def index_post(request: Request):
|
||||
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"index.html",
|
||||
context={
|
||||
"request": request,
|
||||
"demo": state.demo,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@web_client.get("/chat", response_class=FileResponse)
|
||||
@requires(["authenticated"], redirect="login_page")
|
||||
def chat_page(request: Request):
|
||||
return templates.TemplateResponse("chat.html", context={"request": request, "demo": state.demo})
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"chat.html",
|
||||
context={
|
||||
"request": request,
|
||||
"demo": state.demo,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@web_client.get("/login", response_class=FileResponse)
|
||||
|
@ -84,7 +117,8 @@ if not state.demo:
|
|||
@requires(["authenticated"], redirect="login_page")
|
||||
def config_page(request: Request):
|
||||
user = request.user.object
|
||||
enabled_content = set(EmbeddingsAdapters.get_unique_file_types(user).all())
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
enabled_content = set(EntryAdapters.get_unique_file_types(user).all())
|
||||
default_full_config = FullConfig(
|
||||
content_type=None,
|
||||
search_type=None,
|
||||
|
@ -128,7 +162,8 @@ if not state.demo:
|
|||
"current_config": current_config,
|
||||
"current_model_state": successfully_configured,
|
||||
"anonymous_mode": state.anonymous_mode,
|
||||
"username": user.username if user else None,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -136,6 +171,7 @@ if not state.demo:
|
|||
@requires(["authenticated"], redirect="login_page")
|
||||
def github_config_page(request: Request):
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
current_github_config = get_user_github_config(user)
|
||||
|
||||
if current_github_config:
|
||||
|
@ -158,13 +194,20 @@ if not state.demo:
|
|||
current_config = {} # type: ignore
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"content_type_github_input.html", context={"request": request, "current_config": current_config}
|
||||
"content_type_github_input.html",
|
||||
context={
|
||||
"request": request,
|
||||
"current_config": current_config,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
|
||||
@requires(["authenticated"], redirect="login_page")
|
||||
def notion_config_page(request: Request):
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
current_notion_config = get_user_notion_config(user)
|
||||
|
||||
current_config = NotionContentConfig(
|
||||
|
@ -174,7 +217,13 @@ if not state.demo:
|
|||
current_config = json.loads(current_config.json())
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"content_type_notion_input.html", context={"request": request, "current_config": current_config}
|
||||
"content_type_notion_input.html",
|
||||
context={
|
||||
"request": request,
|
||||
"current_config": current_config,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
|
||||
|
@ -185,6 +234,7 @@ if not state.demo:
|
|||
|
||||
object = map_config_to_object(content_type)
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
config = object.objects.filter(user=user).first()
|
||||
if config == None:
|
||||
config = object.objects.create(user=user)
|
||||
|
@ -202,6 +252,8 @@ if not state.demo:
|
|||
"request": request,
|
||||
"current_config": current_config,
|
||||
"content_type": content_type,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -209,6 +261,7 @@ if not state.demo:
|
|||
@requires(["authenticated"], redirect="login_page")
|
||||
def conversation_processor_config_page(request: Request):
|
||||
user = request.user.object
|
||||
user_picture = request.session.get("user", {}).get("picture")
|
||||
openai_config = ConversationAdapters.get_openai_conversation_config(user)
|
||||
|
||||
if openai_config:
|
||||
|
@ -229,5 +282,7 @@ if not state.demo:
|
|||
context={
|
||||
"request": request,
|
||||
"current_config": current_processor_openai_config,
|
||||
"username": user.username,
|
||||
"user_photo": user_picture,
|
||||
},
|
||||
)
|
||||
|
|
|
@ -6,31 +6,31 @@ from typing import List, Tuple, Type, Union, Dict
|
|||
|
||||
# External Packages
|
||||
import torch
|
||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
from sentence_transformers import util
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import state
|
||||
from khoj.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, timer
|
||||
from khoj.utils.helpers import get_absolute_path, timer
|
||||
from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils.rawconfig import SearchResponse, Entry
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from database.adapters import EmbeddingsAdapters
|
||||
from database.models import KhojUser, Embeddings
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from database.adapters import EntryAdapters
|
||||
from database.models import KhojUser, Entry as DbEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
search_type_to_embeddings_type = {
|
||||
SearchType.Org.value: Embeddings.EmbeddingsType.ORG,
|
||||
SearchType.Markdown.value: Embeddings.EmbeddingsType.MARKDOWN,
|
||||
SearchType.Plaintext.value: Embeddings.EmbeddingsType.PLAINTEXT,
|
||||
SearchType.Pdf.value: Embeddings.EmbeddingsType.PDF,
|
||||
SearchType.Github.value: Embeddings.EmbeddingsType.GITHUB,
|
||||
SearchType.Notion.value: Embeddings.EmbeddingsType.NOTION,
|
||||
SearchType.Org.value: DbEntry.EntryType.ORG,
|
||||
SearchType.Markdown.value: DbEntry.EntryType.MARKDOWN,
|
||||
SearchType.Plaintext.value: DbEntry.EntryType.PLAINTEXT,
|
||||
SearchType.Pdf.value: DbEntry.EntryType.PDF,
|
||||
SearchType.Github.value: DbEntry.EntryType.GITHUB,
|
||||
SearchType.Notion.value: DbEntry.EntryType.NOTION,
|
||||
SearchType.All.value: None,
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ async def query(
|
|||
# Find relevant entries for the query
|
||||
top_k = 10
|
||||
with timer("Search Time", logger, state.device):
|
||||
hits = EmbeddingsAdapters.search_with_embeddings(
|
||||
hits = EntryAdapters.search_with_embeddings(
|
||||
user=user,
|
||||
embeddings=question_embedding,
|
||||
max_results=top_k,
|
||||
|
@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
|
|||
|
||||
|
||||
def setup(
|
||||
text_to_jsonl: Type[TextEmbeddings],
|
||||
text_to_entries: Type[TextToEntries],
|
||||
files: dict[str, str],
|
||||
regenerate: bool,
|
||||
full_corpus: bool = True,
|
||||
|
@ -196,11 +196,11 @@ def setup(
|
|||
config=None,
|
||||
) -> None:
|
||||
if config:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
)
|
||||
else:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
)
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ app = FastAPI()
|
|||
|
||||
# Internal Packages
|
||||
from khoj.configure import configure_routes, configure_search_types, configure_middleware
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.constants import web_directory
|
||||
|
@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
|
|||
)
|
||||
from khoj.utils import state, fs_syncer
|
||||
from khoj.routers.indexer import configure_content
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from database.models import (
|
||||
KhojApiUser,
|
||||
LocalOrgConfig,
|
||||
|
@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
|
|||
user=default_user,
|
||||
)
|
||||
|
||||
text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
|
||||
|
||||
if os.getenv("GITHUB_PAT_TOKEN"):
|
||||
GithubConfig.objects.create(
|
||||
|
@ -242,7 +242,7 @@ def client(
|
|||
# These lines help us Mock the Search models for these search types
|
||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
get_sample_data("org"),
|
||||
regenerate=False,
|
||||
user=api_user.user,
|
||||
|
@ -251,7 +251,7 @@ def client(
|
|||
content_config.image, state.search_models.image_search, regenerate=False
|
||||
)
|
||||
text_search.setup(
|
||||
PlaintextToJsonl,
|
||||
PlaintextToEntries,
|
||||
get_sample_data("plaintext"),
|
||||
regenerate=False,
|
||||
user=api_user.user,
|
||||
|
|
|
@ -15,9 +15,9 @@ from khoj.utils import state
|
|||
from khoj.utils.state import search_models, content_index, config
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from database.models import KhojUser
|
||||
from database.adapters import EmbeddingsAdapters
|
||||
from database.adapters import EntryAdapters
|
||||
|
||||
|
||||
# Test
|
||||
|
@ -176,9 +176,9 @@ def test_regenerate_with_github_fails_without_pat(client):
|
|||
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
||||
def test_get_configured_types_via_api(client, sample_org_data):
|
||||
# Act
|
||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False)
|
||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False)
|
||||
|
||||
enabled_types = EmbeddingsAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
|
||||
enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
|
||||
|
||||
# Assert
|
||||
assert list(enabled_types) == ["org"]
|
||||
|
@ -189,7 +189,7 @@ def test_get_configured_types_via_api(client, sample_org_data):
|
|||
def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
|
||||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||
|
||||
# Act
|
||||
response = client.get(f"/api/config/types", headers=headers)
|
||||
|
@ -255,7 +255,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
|
|||
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
||||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||
user_query = quote("How to git install application?")
|
||||
|
||||
# Act
|
||||
|
@ -276,7 +276,7 @@ def test_notes_search_with_only_filters(
|
|||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
sample_org_data,
|
||||
regenerate=False,
|
||||
user=default_user,
|
||||
|
@ -298,7 +298,7 @@ def test_notes_search_with_only_filters(
|
|||
def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
|
||||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||
user_query = quote('How to git install application? +"Emacs"')
|
||||
|
||||
# Act
|
||||
|
@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
|
|||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
sample_org_data,
|
||||
regenerate=False,
|
||||
user=default_user,
|
||||
|
@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
|
|||
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
|
||||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-token"} # Token for default_user2
|
||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||
user_query = quote("How to git install application?")
|
||||
|
||||
# Act
|
||||
|
|
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||
import os
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.utils.fs_syncer import get_markdown_files
|
||||
from khoj.utils.rawconfig import TextContentConfig
|
||||
|
||||
|
@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
||||
entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
|
||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
|
@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
||||
entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
|
||||
MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
|
@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
||||
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
||||
entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||
entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
|
@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Markdown files
|
||||
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
||||
entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
|
|
|
@ -3,8 +3,8 @@ import json
|
|||
import os
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import is_none_or_empty
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils.fs_syncer import get_org_files
|
||||
|
@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
|
|||
for index_heading_entries in [True, False]:
|
||||
# Act
|
||||
# Extract entries into jsonl from specified Org files
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(
|
||||
*OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
|
||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||
OrgToEntries.convert_org_nodes_to_entries(
|
||||
*OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
|
||||
)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
|
||||
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
# Split each entry from specified Org files by max words
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
TextEmbeddings.split_entries_by_max_tokens(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||
TextToEntries.split_entries_by_max_tokens(
|
||||
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||
)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
|
|||
|
||||
# Act
|
||||
# Split entry by max words and drop words larger than max word length
|
||||
processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||
processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||
|
||||
# Assert
|
||||
# "Heading" dropped from compiled version because its over the set max word limit
|
||||
|
@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
|
||||
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
|
||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
|
@ -136,11 +136,11 @@ Intro text
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
|
||||
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
||||
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
|
@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
|
||||
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
||||
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
|
@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
|
||||
entries, _ = OrgToEntries.extract_org_entries(org_files=data)
|
||||
|
||||
# Assert
|
||||
assert len(entries) == 2
|
||||
|
|
|
@ -3,7 +3,7 @@ import json
|
|||
import os
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
|
||||
|
||||
from khoj.utils.fs_syncer import get_pdf_files
|
||||
from khoj.utils.rawconfig import TextContentConfig
|
||||
|
@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
|
|||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
# Process Each Entry from All Pdf Files
|
||||
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
|
||||
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
|
||||
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
|
@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
|
|||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
# Process Each Entry from All Pdf Files
|
||||
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
|
||||
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
|
||||
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
# Internal Packages
|
||||
from khoj.utils.fs_syncer import get_plaintext_files
|
||||
from khoj.utils.rawconfig import TextContentConfig
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from database.models import LocalPlaintextConfig, KhojUser
|
||||
|
||||
|
||||
|
@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
|
|||
f"{plaintextfile}": entry,
|
||||
}
|
||||
|
||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
|
||||
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)
|
||||
|
||||
# Convert each entry.file to absolute path to make them JSON serializable
|
||||
for map in maps:
|
||||
map.file = str(Path(map.file).absolute())
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
|
||||
jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
|
@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
|
|||
extracted_plaintext_files = get_plaintext_files(config=config)
|
||||
|
||||
# Act
|
||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
|
||||
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)
|
||||
|
||||
# Assert
|
||||
assert len(maps) == 1
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
# System Packages
|
||||
import logging
|
||||
import locale
|
||||
from pathlib import Path
|
||||
import os
|
||||
import asyncio
|
||||
|
@ -11,10 +10,10 @@ import pytest
|
|||
# Internal Packages
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.github.github_to_entries import GithubToEntries
|
||||
from khoj.utils.fs_syncer import collect_files, get_org_files
|
||||
from database.models import LocalOrgConfig, KhojUser, Embeddings, GithubConfig
|
||||
from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -66,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
|
|||
# Act
|
||||
# Generate notes embeddings during asymmetric setup
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
|
||||
assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
||||
verify_embeddings(0, default_user)
|
||||
|
@ -81,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
|
|||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
data = get_org_files(org_config)
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
|
||||
# Assert
|
||||
assert "Deleting all embeddings for file type org" in caplog.text
|
||||
|
@ -95,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
|
|||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
data = get_org_files(org_config)
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
|
||||
# Assert
|
||||
assert "Created 4 new embeddings" in caplog.text
|
||||
|
@ -113,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
|
|||
# Act
|
||||
# Generate initial notes embeddings during asymmetric setup
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
initial_logs = caplog.text
|
||||
caplog.clear() # Clear logs
|
||||
|
||||
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
|
||||
with caplog.at_level(logging.DEBUG):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||
final_logs = caplog.text
|
||||
|
||||
# Assert
|
||||
|
@ -149,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
|
|||
await loop.run_in_executor(
|
||||
None,
|
||||
text_search.setup,
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
data,
|
||||
True,
|
||||
True,
|
||||
|
@ -186,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
|
|||
# Act
|
||||
# reload embeddings, entries, notes model after adding new org-mode file
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||
|
||||
# Assert
|
||||
# verify newly added org-mode entry is split by max tokens
|
||||
|
@ -219,7 +218,7 @@ conda activate khoj
|
|||
#+end_src"""
|
||||
}
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
data,
|
||||
regenerate=False,
|
||||
user=default_user,
|
||||
|
@ -238,7 +237,7 @@ conda activate khoj
|
|||
# reload embeddings, entries, notes model after adding new org-mode file
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
data,
|
||||
regenerate=False,
|
||||
full_corpus=False,
|
||||
|
@ -260,7 +259,7 @@ def test_regenerate_index_with_new_entry(
|
|||
data = get_org_files(org_config)
|
||||
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
|
||||
assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
||||
|
||||
|
@ -274,7 +273,7 @@ def test_regenerate_index_with_new_entry(
|
|||
# Act
|
||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
|
||||
# Assert
|
||||
assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
|
||||
|
@ -299,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||
# Act
|
||||
# generate embeddings, entries, notes model from scratch after adding new org-mode file
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
initial_logs = caplog.text
|
||||
caplog.clear() # Clear logs
|
||||
|
||||
|
@ -307,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||
|
||||
# update embeddings, entries, notes model with no new changes
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||
final_logs = caplog.text
|
||||
|
||||
# Assert
|
||||
|
@ -332,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
|
|||
|
||||
# load embeddings, entries, notes model after adding new org file with 2 entries
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
initial_logs = caplog.text
|
||||
caplog.clear() # Clear logs
|
||||
|
||||
|
@ -344,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
|
|||
|
||||
# Act
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||
final_logs = caplog.text
|
||||
|
||||
# Assert
|
||||
|
@ -362,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
|
|||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
data = get_org_files(org_config)
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||
initial_logs = caplog.text
|
||||
caplog.clear() # Clear logs
|
||||
|
||||
|
@ -376,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
|
|||
# Act
|
||||
# update embeddings, entries with the newly added note
|
||||
with caplog.at_level(logging.INFO):
|
||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||
final_logs = caplog.text
|
||||
|
||||
# Assert
|
||||
|
@ -394,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
|
|||
# Act
|
||||
# Regenerate github embeddings to test asymmetric setup without caching
|
||||
text_search.setup(
|
||||
GithubToJsonl,
|
||||
GithubToEntries,
|
||||
{},
|
||||
regenerate=True,
|
||||
user=default_user,
|
||||
|
@ -402,10 +401,10 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
|
|||
)
|
||||
|
||||
# Assert
|
||||
embeddings = Embeddings.objects.filter(user=default_user, file_type="github").count()
|
||||
embeddings = Entry.objects.filter(user=default_user, file_type="github").count()
|
||||
assert embeddings > 1
|
||||
|
||||
|
||||
def verify_embeddings(expected_count, user):
|
||||
embeddings = Embeddings.objects.filter(user=user, file_type="org").count()
|
||||
embeddings = Entry.objects.filter(user=user, file_type="org").count()
|
||||
assert embeddings == expected_count
|
||||
|
|
Loading…
Add table
Reference in a new issue