[Multi-User Part 7]: Improve Sign-In UX & Rename DB Models for Readability (#528)

### ✨ New
- Create profile pic drop-down menu in navigation pane
  Put the settings page and the logout action under the drop-down menu

### ⚙️ Fix
- Add Key icon for API keys table on Web Client's settings page

### 🧪 Improve
- Rename `TextEmbeddings` to `TextToEntries` for improved readability
- Rename `Db.Models` `Embeddings`, `EmbeddingsAdapter` to `Entry`, `EntryAdapter`
- Show truncated API key for identification & restrict table width for config page responsiveness
This commit is contained in:
Debanjum 2023-11-01 18:05:20 -07:00 committed by GitHub
commit 0fb81189ca
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 498 additions and 326 deletions

View file

@ -27,7 +27,7 @@ from database.models import (
KhojApiUser,
NotionConfig,
GithubConfig,
Embeddings,
Entry,
GithubRepoConfig,
Conversation,
ConversationProcessorConfig,
@ -286,54 +286,54 @@ class ConversationAdapters:
return await OpenAIProcessorConversationConfig.objects.filter(user=user).afirst()
class EmbeddingsAdapters:
class EntryAdapters:
word_filer = WordFilter()
file_filter = FileFilter()
date_filter = DateFilter()
@staticmethod
def does_embedding_exist(user: KhojUser, hashed_value: str) -> bool:
return Embeddings.objects.filter(user=user, hashed_value=hashed_value).exists()
def does_entry_exist(user: KhojUser, hashed_value: str) -> bool:
return Entry.objects.filter(user=user, hashed_value=hashed_value).exists()
@staticmethod
def delete_embedding_by_file(user: KhojUser, file_path: str):
deleted_count, _ = Embeddings.objects.filter(user=user, file_path=file_path).delete()
def delete_entry_by_file(user: KhojUser, file_path: str):
deleted_count, _ = Entry.objects.filter(user=user, file_path=file_path).delete()
return deleted_count
@staticmethod
def delete_all_embeddings(user: KhojUser, file_type: str):
deleted_count, _ = Embeddings.objects.filter(user=user, file_type=file_type).delete()
def delete_all_entries(user: KhojUser, file_type: str):
deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete()
return deleted_count
@staticmethod
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
return Embeddings.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
@staticmethod
def delete_embedding_by_hash(user: KhojUser, hashed_values: List[str]):
Embeddings.objects.filter(user=user, hashed_value__in=hashed_values).delete()
def delete_entry_by_hash(user: KhojUser, hashed_values: List[str]):
Entry.objects.filter(user=user, hashed_value__in=hashed_values).delete()
@staticmethod
def get_embeddings_by_date_filter(embeddings: BaseManager[Embeddings], start_date: date, end_date: date):
return embeddings.filter(
embeddingsdates__date__gte=start_date,
embeddingsdates__date__lte=end_date,
def get_entries_by_date_filter(entry: BaseManager[Entry], start_date: date, end_date: date):
return entry.filter(
entrydates__date__gte=start_date,
entrydates__date__lte=end_date,
)
@staticmethod
async def user_has_embeddings(user: KhojUser):
return await Embeddings.objects.filter(user=user).aexists()
async def user_has_entries(user: KhojUser):
return await Entry.objects.filter(user=user).aexists()
@staticmethod
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
q_filter_terms = Q()
explicit_word_terms = EmbeddingsAdapters.word_filer.get_filter_terms(query)
file_filters = EmbeddingsAdapters.file_filter.get_filter_terms(query)
date_filters = EmbeddingsAdapters.date_filter.get_query_date_range(query)
explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query)
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
return Embeddings.objects.filter(user=user)
return Entry.objects.filter(user=user)
for term in explicit_word_terms:
if term.startswith("+"):
@ -354,32 +354,32 @@ class EmbeddingsAdapters:
if min_date is not None:
# Convert the min_date timestamp to yyyy-mm-dd format
formatted_min_date = date.fromtimestamp(min_date).strftime("%Y-%m-%d")
q_filter_terms &= Q(embeddings_dates__date__gte=formatted_min_date)
q_filter_terms &= Q(entry_dates__date__gte=formatted_min_date)
if max_date is not None:
# Convert the max_date timestamp to yyyy-mm-dd format
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
q_filter_terms &= Q(entry_dates__date__lte=formatted_max_date)
relevant_embeddings = Embeddings.objects.filter(user=user).filter(
relevant_entries = Entry.objects.filter(user=user).filter(
q_filter_terms,
)
if file_type_filter:
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
return relevant_embeddings
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
return relevant_entries
@staticmethod
def search_with_embeddings(
user: KhojUser, embeddings: Tensor, max_results: int = 10, file_type_filter: str = None, raw_query: str = None
):
relevant_embeddings = EmbeddingsAdapters.apply_filters(user, raw_query, file_type_filter)
relevant_embeddings = relevant_embeddings.filter(user=user).annotate(
relevant_entries = EntryAdapters.apply_filters(user, raw_query, file_type_filter)
relevant_entries = relevant_entries.filter(user=user).annotate(
distance=CosineDistance("embeddings", embeddings)
)
if file_type_filter:
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
relevant_embeddings = relevant_embeddings.order_by("distance")
return relevant_embeddings[:max_results]
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
relevant_entries = relevant_entries.order_by("distance")
return relevant_entries[:max_results]
@staticmethod
def get_unique_file_types(user: KhojUser):
return Embeddings.objects.filter(user=user).values_list("file_type", flat=True).distinct()
return Entry.objects.filter(user=user).values_list("file_type", flat=True).distinct()

View file

@ -0,0 +1,30 @@
# Generated by Django 4.2.5 on 2023-10-26 23:52
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("database", "0009_khojapiuser"),
]
operations = [
migrations.RenameModel(
old_name="Embeddings",
new_name="Entry",
),
migrations.RenameModel(
old_name="EmbeddingsDates",
new_name="EntryDates",
),
migrations.RenameField(
model_name="entrydates",
old_name="embeddings",
new_name="entry",
),
migrations.RenameIndex(
model_name="entrydates",
new_name="database_en_date_8d823c_idx",
old_name="database_em_date_a1ba47_idx",
),
]

View file

@ -114,8 +114,8 @@ class Conversation(BaseModel):
conversation_log = models.JSONField(default=dict)
class Embeddings(BaseModel):
class EmbeddingsType(models.TextChoices):
class Entry(BaseModel):
class EntryType(models.TextChoices):
IMAGE = "image"
PDF = "pdf"
PLAINTEXT = "plaintext"
@ -130,7 +130,7 @@ class Embeddings(BaseModel):
raw = models.TextField()
compiled = models.TextField()
heading = models.CharField(max_length=1000, default=None, null=True, blank=True)
file_type = models.CharField(max_length=30, choices=EmbeddingsType.choices, default=EmbeddingsType.PLAINTEXT)
file_type = models.CharField(max_length=30, choices=EntryType.choices, default=EntryType.PLAINTEXT)
file_path = models.CharField(max_length=400, default=None, null=True, blank=True)
file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
url = models.URLField(max_length=400, default=None, null=True, blank=True)
@ -138,9 +138,9 @@ class Embeddings(BaseModel):
corpus_id = models.UUIDField(default=uuid.uuid4, editable=False)
class EmbeddingsDates(BaseModel):
class EntryDates(BaseModel):
date = models.DateField()
embeddings = models.ForeignKey(Embeddings, on_delete=models.CASCADE, related_name="embeddings_dates")
entry = models.ForeignKey(Entry, on_delete=models.CASCADE, related_name="embeddings_dates")
class Meta:
indexes = [

View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<svg width="800px" height="800px" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" clip-rule="evenodd" d="M22 8.29344C22 11.7692 19.1708 14.5869 15.6807 14.5869C15.0439 14.5869 13.5939 14.4405 12.8885 13.8551L12.0067 14.7333C11.4883 15.2496 11.6283 15.4016 11.8589 15.652C11.9551 15.7565 12.0672 15.8781 12.1537 16.0505C12.1537 16.0505 12.8885 17.075 12.1537 18.0995C11.7128 18.6849 10.4783 19.5045 9.06754 18.0995L8.77362 18.3922C8.77362 18.3922 9.65538 19.4167 8.92058 20.4412C8.4797 21.0267 7.30403 21.6121 6.27531 20.5876L5.2466 21.6121C4.54119 22.3146 3.67905 21.9048 3.33616 21.6121L2.45441 20.7339C1.63143 19.9143 2.1115 19.0264 2.45441 18.6849L10.0963 11.0743C10.0963 11.0743 9.3615 9.90338 9.3615 8.29344C9.3615 4.81767 12.1907 2 15.6807 2C19.1708 2 22 4.81767 22 8.29344ZM15.681 10.4889C16.8984 10.4889 17.8853 9.50601 17.8853 8.29353C17.8853 7.08105 16.8984 6.09814 15.681 6.09814C14.4635 6.09814 13.4766 7.08105 13.4766 8.29353C13.4766 9.50601 14.4635 10.4889 15.681 10.4889Z" fill="#1C274C"/>
</svg>

After

Width:  |  Height:  |  Size: 1.1 KiB

View file

@ -6,6 +6,8 @@
--primary-hover: #ffa000;
--primary-focus: rgba(255, 179, 0, 0.125);
--primary-inverse: rgba(0, 0, 0, 0.75);
--background-color: #fff;
--main-text-color: #475569;
}
/* Amber Dark scheme (Auto) */
@ -16,6 +18,8 @@
--primary-hover: #ffc107;
--primary-focus: rgba(255, 179, 0, 0.25);
--primary-inverse: rgba(0, 0, 0, 0.75);
--background-color: #fff;
--main-text-color: #475569;
}
}
/* Amber Dark scheme (Forced) */
@ -25,6 +29,8 @@
--primary-hover: #ffc107;
--primary-focus: rgba(255, 179, 0, 0.25);
--primary-inverse: rgba(0, 0, 0, 0.75);
--background-color: #fff;
--main-text-color: #475569;
}
/* Amber (Common styles) */
:root {
@ -37,7 +43,8 @@
.khoj-configure {
display: grid;
grid-template-columns: 1fr;
padding: 0 24px;
font-family: roboto, karma, segoe ui, sans-serif;
font-weight: 300;
}
.khoj-header {
display: grid;
@ -100,7 +107,84 @@ p#khoj-banner {
display: inline;
}
@media only screen and (max-width: 600px) {
/* Dropdown in navigation menu*/
#khoj-nav-menu-container {
display: flex;
align-items: center;
}
.khoj-nav-dropdown-content {
display: block;
grid-auto-flow: row;
position: absolute;
background-color: var(--background-color);
min-width: 160px;
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
right: 15vw;
top: 64px;
z-index: 1;
opacity: 0;
transition: opacity 0.1s ease-in-out;
pointer-events: none;
text-align: left;
}
.khoj-nav-dropdown-content.show {
opacity: 1;
pointer-events: auto;
}
.khoj-nav-dropdown-content a {
color: black;
padding: 12px 16px;
text-decoration: none;
display: block;
}
.khoj-nav-dropdown-content a:hover {
background-color: var(--primary-hover);
}
.khoj-nav-username {
padding: 12px 16px;
text-decoration: none;
display: block;
font-weight: bold;
}
.circle {
border-radius: 50%;
border: 2px solid var(--primary-inverse);
width: 40px;
height: 40px;
vertical-align: text-top;
padding: 3px;
cursor: pointer;
}
.circle:hover {
background-color: var(--primary-hover);
}
.user-initial {
background-color: white;
color: black;
display: grid;
justify-content: center;
align-items: center;
font-size: 20px;
box-sizing: unset;
}
@media screen and (max-width: 700px) {
.khoj-nav-dropdown-content {
display: block;
grid-auto-flow: row;
position: absolute;
background-color: var(--background-color);
min-width: 160px;
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
right: 10px;
z-index: 1;
opacity: 0;
transition: opacity 0.1s ease-in-out;
pointer-events: none;
}
}
@media only screen and (max-width: 700px) {
div.khoj-header {
display: grid;
grid-auto-flow: column;

View file

@ -0,0 +1,15 @@
// Toggle the navigation menu
// Show/hide the profile-pic navigation dropdown
function toggleMenu() {
    document.getElementById("khoj-nav-menu").classList.toggle("show");
}
// Hide the open dropdown menu when a click lands outside the menu container
document.addEventListener('click', function(event) {
    const menu = document.getElementById("khoj-nav-menu");
    const container = document.getElementById("khoj-nav-menu-container");
    const clickedInside = container === event.target || container.contains(event.target);
    if (!clickedInside && menu.classList.contains("show")) {
        menu.classList.remove("show");
    }
});

View file

@ -8,19 +8,15 @@
<link rel="stylesheet" href="/static/assets/pico.min.css">
<link rel="stylesheet" href="/static/assets/khoj.css">
</head>
<script type="text/javascript" src="/static/assets/khoj.js"></script>
<body class="khoj-configure">
<div class="khoj-header-wrapper">
<div class="filler"></div>
<div class="khoj-header">
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
<nav class="khoj-nav">
<a class="khoj-nav" href="/chat">Chat</a>
<a class="khoj-nav" href="/">Search</a>
<a class="khoj-nav khoj-nav-selected" href="/config">Settings</a>
</nav>
</div>
<!--Add Header Logo and Nav Pane-->
{% import 'utils.html' as utils %}
{{ utils.heading_pane(user_photo, username) }}
<div class="filler"></div>
</div>
<div class=”content”>
@ -38,10 +34,15 @@
img.khoj-logo {
max-width: none!important;
}
div.khoj-header-wrapper{
div.khoj-header-wrapper {
display: grid;
grid-template-columns: 1fr min(70vw, 100%) 1fr;
}
img.circle {
width: 49px;
height: 49px;
}
.page {
display: grid;
grid-auto-flow: row;
@ -233,12 +234,12 @@
height: 32px;
}
@media screen and (max-width: 600px) {
@media screen and (max-width: 700px) {
.section-cards {
grid-template-columns: 1fr;
}
}
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
body {
display: grid;
grid-template-columns: 1fr;
@ -264,10 +265,9 @@
width: 320px;
}
div.khoj-header-wrapper{
div.khoj-header-wrapper {
grid-template-columns: auto;
}
}
</style>
</html>

View file

@ -8,6 +8,7 @@
<link rel="manifest" href="/static/khoj_chat.webmanifest">
<link rel="stylesheet" href="/static/assets/khoj.css">
</head>
<script type="text/javascript" src="/static/assets/khoj.js"></script>
<script>
let chatOptions = [];
function copyProgrammaticOutput(event) {
@ -269,25 +270,10 @@
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
{% endif %}
</div>
<!--Add Header Logo and Nav Pane-->
<div class="khoj-header">
{% if demo %}
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
{% else %}
<a class="khoj-logo" href="/">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
{% endif %}
<nav class="khoj-nav">
<a class="khoj-nav khoj-nav-selected" href="/chat">Chat</a>
<a class="khoj-nav" href="/">Search</a>
{% if not demo %}
<a class="khoj-nav" href="/config">Settings</a>
{% endif %}
</nav>
</div>
{% import 'utils.html' as utils %}
{{ utils.heading_pane(user_photo, username) }}
<!-- Chat Body -->
<div id="chat-body"></div>
@ -309,8 +295,8 @@
}
body {
display: grid;
background: #fff;
color: #475569;
background: var(--background-color);
color: var(--main-text-color);
text-align: center;
font-family: roboto, karma, segoe ui, sans-serif;
font-size: 20px;
@ -332,7 +318,7 @@
content: attr(data-meta);
display: block;
font-size: x-small;
color: #475569;
color: var(--main-text-color);
margin: -8px 4px 0 -5px;
}
/* move message by khoj to left */
@ -402,7 +388,7 @@
top: 91%;
right: -2px;
border: 10px solid transparent;
border-left-color: #475569;
border-left-color: var(--main-text-color);
border-right: 0;
margin-top: -10px;
transform: rotate(-60deg)
@ -418,7 +404,7 @@
#chat-footer > * {
padding: 15px;
border-radius: 5px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
background: #f9fafc
}
.option:hover {
@ -451,9 +437,9 @@
}
a.inline-chat-link {
color: #475569;
color: var(--main-text-color);
text-decoration: none;
border-bottom: 1px dotted #475569;
border-bottom: 1px dotted var(--main-text-color);
}
@media (pointer: coarse), (hover: none) {
@ -479,7 +465,7 @@
padding: 2px 4px;
}
}
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
body {
grid-template-columns: 1fr;
grid-template-rows: auto auto minmax(80px, 100%) auto;
@ -499,7 +485,7 @@
padding: 0;
}
}
@media only screen and (min-width: 600px) {
@media only screen and (min-width: 700px) {
body {
grid-template-columns: auto min(70vw, 100%) auto;
grid-template-rows: auto auto minmax(80px, 100%) auto;
@ -542,7 +528,7 @@
input#khoj-banner-email {
padding: 10px;
border-radius: 5px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
background: #f9fafc;
}

View file

@ -3,11 +3,6 @@
<div class="page">
<div class="section">
{% if anonymous_mode == False %}
<div>
Logged in as {{ username }}
</div>
{% endif %}
<h2 class="section-title">Plugins</h2>
<div class="section-cards">
<div class="card">
@ -328,11 +323,6 @@
<div class="finalize-buttons">
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
</div>
{% if anonymous_mode == False %}
<div class="finalize-buttons">
<button id="logout" class="logout" onclick="window.location.href='/auth/logout'">Logout</button>
</div>
{% endif %}
</div>
</div>
</div>
@ -541,16 +531,7 @@
})
.then(response => response.json())
.then(tokenObj => {
apiKeyList.innerHTML += `
<tr id="api-key-item-${tokenObj.token}">
<td><b>${tokenObj.name}</b></td>
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
<td>
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
</td>
</tr>
`;
apiKeyList.innerHTML += generateTokenRow(tokenObj);
});
}
@ -561,7 +542,7 @@
const copyApiKeyButton = document.getElementById(`api-key-${token}`);
original_html = copyApiKeyButton.innerHTML
setTimeout(function() {
copyApiKeyButton.innerHTML = "✅ Copied to your clipboard!";
copyApiKeyButton.innerHTML = "✅ Copied!";
setTimeout(function() {
copyApiKeyButton.innerHTML = original_html;
}, 1000);
@ -581,23 +562,30 @@
});
}
// Render one API-key table row. Only a truncated form of the token
// (first 4 + last 4 chars) is displayed for identification; the full
// token is still wired into the copy/delete handlers and element ids.
function generateTokenRow(tokenObj) {
    let token = tokenObj.token;
    let tokenName = tokenObj.name;
    let truncatedToken = token.slice(0, 4) + "..." + token.slice(-4);
    return `
        <tr id="api-key-item-${token}">
            <td><b>${tokenName}</b></td>
            <td id="api-key-${token}">${truncatedToken}</td>
            <td>
                <img onclick="copyAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key" title="Copy API Key">
                <img onclick="deleteAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key" title="Delete API Key">
            </td>
        </tr>
    `;
}
function listApiKeys() {
const apiKeyList = document.getElementById("api-key-list");
fetch('/auth/token')
.then(response => response.json())
.then(tokens => {
apiKeyList.innerHTML = tokens.map(tokenObj =>
`
<tr id="api-key-item-${tokenObj.token}">
<td><b>${tokenObj.name}</b></td>
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
<td>
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
</td>
</tr>
`)
.join("");
apiKeyList.innerHTML = tokens.map(generateTokenRow).join("");
});
}

View file

@ -10,6 +10,7 @@
</head>
<script type="text/javascript" src="/static/assets/org.min.js"></script>
<script type="text/javascript" src="/static/assets/markdown-it.min.js"></script>
<script type="text/javascript" src="/static/assets/khoj.js"></script>
<script>
function render_image(item) {
@ -281,25 +282,10 @@
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
</div>
{% endif %}
<!--Add Header Logo and Nav Pane-->
<div class="khoj-header">
{% if demo %}
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
{% else %}
<a class="khoj-logo" href="/">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
{% endif %}
<nav class="khoj-nav">
<a class="khoj-nav" href="/chat">Chat</a>
<a class="khoj-nav khoj-nav-selected" href="/">Search</a>
{% if not demo %}
<a class="khoj-nav" href="/config">Settings</a>
{% endif %}
</nav>
</div>
{% import 'utils.html' as utils %}
{{ utils.heading_pane(user_photo, username) }}
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search your knowledge base using natural language">
@ -314,7 +300,7 @@
</body>
<style>
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
body {
display: grid;
grid-template-columns: 1fr;
@ -325,7 +311,7 @@
grid-column: 1;
}
}
@media only screen and (min-width: 600px) {
@media only screen and (min-width: 700px) {
body {
display: grid;
grid-template-columns: 1fr min(70vw, 100%) 1fr;
@ -339,8 +325,8 @@
body {
padding: 0px;
margin: 0px;
background: #fff;
color: #475569;
background: var(--background-color);
color: var(--main-text-color);
font-family: roboto, karma, segoe ui, sans-serif;
font-size: 20px;
font-weight: 300;
@ -358,7 +344,7 @@
#options > * {
padding: 15px;
border-radius: 5px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
background: #f9fafc
}
.option:hover {
@ -386,7 +372,7 @@
.image {
width: 20vw;
border-radius: 10px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
}
#json {
white-space: pre-wrap;
@ -429,7 +415,7 @@
padding: 3.5px 3.5px 0;
margin-right: 5px;
border-radius: 5px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
background-color: #ef4444;
font-size: small;
}
@ -500,7 +486,7 @@
input#khoj-banner-email {
padding: 10px;
border-radius: 5px;
border: 1px solid #475569;
border: 1px solid var(--main-text-color);
background: #f9fafc;
}
@ -509,7 +495,7 @@
box-shadow: 0 0 11px #aaa;
}
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
a.khoj-banner {
display: block;
}

View file

@ -58,7 +58,7 @@
</body>
<style>
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
body {
display: grid;
grid-template-columns: 1fr;
@ -69,7 +69,7 @@
grid-column: 1;
}
}
@media only screen and (min-width: 600px) {
@media only screen and (min-width: 700px) {
body {
display: grid;
grid-template-columns: 1fr min(70vw, 100%) 1fr;
@ -150,7 +150,7 @@
font-size: x-large;
}
@media only screen and (max-width: 600px) {
@media only screen and (max-width: 700px) {
a.khoj-banner {
display: block;
}

View file

@ -0,0 +1,24 @@
{% macro heading_pane(user_photo, username) -%}
<div class="khoj-header">
<a class="khoj-logo" href="/" target="_blank">
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
</a>
<nav class="khoj-nav">
<a class="khoj-nav" href="/chat">💬 Chat</a>
<a class="khoj-nav" href="/">🔎 Search</a>
<!-- Dropdown Menu -->
<div id="khoj-nav-menu-container" class="khoj-nav dropdown">
{% if user_photo and user_photo != "None" %}
<img class="circle" src="{{ user_photo }}" alt="{{ username[0].upper() }}" onclick="toggleMenu()" referrerpolicy="no-referrer">
{% else %}
<div class="circle user-initial" alt="{{ username[0].upper() }}" onclick="toggleMenu()">{{ username[0].upper() }}</div>
{% endif %}
<div id="khoj-nav-menu" class="khoj-nav-dropdown-content">
<div class="khoj-nav-username"> {{ username }} </div>
<a class="khoj-nav khoj-nav-selected" href="/config">⚙️ Settings</a>
<a class="khoj-nav" href="/auth/logout">🔑 Logout</a>
</div>
</div>
</nav>
</div>
{%- endmacro %}

View file

@ -10,17 +10,16 @@ import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.utils.rawconfig import Entry
from database.models import Embeddings, GithubConfig, KhojUser
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from khoj.processor.text_to_entries import TextToEntries
from database.models import Entry as DbEntry, GithubConfig, KhojUser
logger = logging.getLogger(__name__)
class GithubToJsonl(TextEmbeddings):
class GithubToEntries(TextToEntries):
def __init__(self, config: GithubConfig):
super().__init__(config)
raw_repos = config.githubrepoconfig.all()
@ -78,24 +77,26 @@ class GithubToJsonl(TextEmbeddings):
current_entries = []
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
*GithubToJsonl.extract_markdown_entries(markdown_files)
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
*GithubToEntries.extract_markdown_entries(markdown_files)
)
with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
current_entries += OrgToEntries.convert_org_nodes_to_entries(
*GithubToEntries.extract_org_entries(org_files)
)
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
issue_entries = GithubToJsonl.convert_issues_to_entries(
*GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
issue_entries = GithubToEntries.convert_issues_to_entries(
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
)
current_entries += issue_entries
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256)
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
return current_entries
@ -103,7 +104,7 @@ class GithubToJsonl(TextEmbeddings):
# Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries, Embeddings.EmbeddingsType.GITHUB, key="compiled", logger=logger, user=user
current_entries, DbEntry.EntryType.GITHUB, key="compiled", logger=logger, user=user
)
return num_new_embeddings, num_deleted_embeddings
@ -281,7 +282,7 @@ class GithubToJsonl(TextEmbeddings):
entries = []
entry_to_file_map = []
for doc in markdown_files:
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)
@ -292,7 +293,7 @@ class GithubToJsonl(TextEmbeddings):
entry_to_file_map = []
for doc in org_files:
entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
entries, entry_to_file_map = OrgToEntries.process_single_org_file(
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)

View file

@ -6,17 +6,17 @@ from pathlib import Path
from typing import Tuple, List
# Internal Packages
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.rawconfig import Entry
from database.models import Embeddings, KhojUser
from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
class MarkdownToJsonl(TextEmbeddings):
class MarkdownToEntries(TextToEntries):
def __init__(self):
super().__init__()
@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEmbeddings):
# Extract Entries from specified Markdown files
with timer("Parse entries from Markdown files into dictionaries", logger):
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
*MarkdownToJsonl.extract_markdown_entries(files)
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
*MarkdownToEntries.extract_markdown_entries(files)
)
# Split entries by max tokens supported by model
@ -46,7 +46,7 @@ class MarkdownToJsonl(TextEmbeddings):
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries,
Embeddings.EmbeddingsType.MARKDOWN,
DbEntry.EntryType.MARKDOWN,
"compiled",
logger,
deletion_file_names,
@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEmbeddings):
for markdown_file in markdown_files:
try:
markdown_content = markdown_files[markdown_file]
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
markdown_content, markdown_file, entries, entry_to_file_map
)
except Exception as e:

View file

@ -8,9 +8,9 @@ import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, NotionContentConfig
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.rawconfig import Entry
from database.models import Embeddings, KhojUser, NotionConfig
from database.models import Entry as DbEntry, KhojUser, NotionConfig
from enum import Enum
@ -50,7 +50,7 @@ class NotionBlockType(Enum):
CALLOUT = "callout"
class NotionToJsonl(TextEmbeddings):
class NotionToEntries(TextToEntries):
def __init__(self, config: NotionConfig):
super().__init__(config)
self.config = NotionContentConfig(
@ -250,7 +250,7 @@ class NotionToJsonl(TextEmbeddings):
# Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries, Embeddings.EmbeddingsType.NOTION, key="compiled", logger=logger, user=user
current_entries, DbEntry.EntryType.NOTION, key="compiled", logger=logger, user=user
)
return num_new_embeddings, num_deleted_embeddings

View file

@ -5,17 +5,17 @@ from typing import Iterable, List, Tuple
# Internal Packages
from khoj.processor.org_mode import orgnode
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from khoj.utils import state
from database.models import Embeddings, KhojUser
from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
class OrgToJsonl(TextEmbeddings):
class OrgToEntries(TextToEntries):
def __init__(self):
super().__init__()
@ -47,7 +47,7 @@ class OrgToJsonl(TextEmbeddings):
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries,
Embeddings.EmbeddingsType.ORG,
DbEntry.EntryType.ORG,
"compiled",
logger,
deletion_file_names,

View file

@ -8,16 +8,16 @@ import base64
from langchain.document_loaders import PyMuPDFLoader
# Internal Packages
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from database.models import Embeddings, KhojUser
from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
class PdfToJsonl(TextEmbeddings):
class PdfToEntries(TextToEntries):
def __init__(self):
super().__init__()
@ -35,7 +35,7 @@ class PdfToJsonl(TextEmbeddings):
# Extract Entries from specified Pdf files
with timer("Parse entries from PDF files into dictionaries", logger):
current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))
# Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger):
@ -45,7 +45,7 @@ class PdfToJsonl(TextEmbeddings):
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries,
Embeddings.EmbeddingsType.PDF,
DbEntry.EntryType.PDF,
"compiled",
logger,
deletion_file_names,

View file

@ -6,16 +6,16 @@ from bs4 import BeautifulSoup
# Internal Packages
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from database.models import Embeddings, KhojUser
from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
class PlaintextToJsonl(TextEmbeddings):
class PlaintextToEntries(TextToEntries):
def __init__(self):
super().__init__()
@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEmbeddings):
try:
plaintext_content = files[file]
if file.endswith(("html", "htm", "xml")):
plaintext_content = PlaintextToJsonl.extract_html_content(
plaintext_content = PlaintextToEntries.extract_html_content(
plaintext_content, file.split(".")[-1]
)
files[file] = plaintext_content
@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEmbeddings):
# Extract Entries from specified plaintext files
with timer("Parse entries from plaintext files", logger):
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
# Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger):
@ -55,7 +55,7 @@ class PlaintextToJsonl(TextEmbeddings):
with timer("Identify new or updated entries", logger):
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
current_entries,
Embeddings.EmbeddingsType.PLAINTEXT,
DbEntry.EntryType.PLAINTEXT,
key="compiled",
logger=logger,
deletion_filenames=deletion_file_names,

View file

@ -12,14 +12,14 @@ from khoj.utils.helpers import timer, batcher
from khoj.utils.rawconfig import Entry
from khoj.processor.embeddings import EmbeddingsModel
from khoj.search_filter.date_filter import DateFilter
from database.models import KhojUser, Embeddings, EmbeddingsDates
from database.adapters import EmbeddingsAdapters
from database.models import KhojUser, Entry as DbEntry, EntryDates
from database.adapters import EntryAdapters
logger = logging.getLogger(__name__)
class TextEmbeddings(ABC):
class TextToEntries(ABC):
def __init__(self, config: Any = None):
self.embeddings_model = EmbeddingsModel()
self.config = config
@ -85,23 +85,23 @@ class TextEmbeddings(ABC):
):
with timer("Construct current entry hashes", logger):
hashes_by_file = dict[str, set[str]]()
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
for entry in tqdm(current_entries, desc="Hashing Entries"):
hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry))
hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
num_deleted_embeddings = 0
with timer("Preparing dataset for regeneration", logger):
if regenerate:
logger.debug(f"Deleting all embeddings for file type {file_type}")
num_deleted_embeddings = EmbeddingsAdapters.delete_all_embeddings(user, file_type)
num_deleted_embeddings = EntryAdapters.delete_all_entries(user, file_type)
num_new_embeddings = 0
with timer("Identify hashes for adding new entries", logger):
for file in tqdm(hashes_by_file, desc="Processing file with hashed values"):
hashes_for_file = hashes_by_file[file]
hashes_to_process = set()
existing_entries = Embeddings.objects.filter(
existing_entries = DbEntry.objects.filter(
user=user, hashed_value__in=hashes_for_file, file_type=file_type
)
existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
@ -124,7 +124,7 @@ class TextEmbeddings(ABC):
for entry_hash, embedding in entry_batch:
entry = hash_to_current_entries[entry_hash]
batch_embeddings_to_create.append(
Embeddings(
DbEntry(
user=user,
embeddings=embedding,
raw=entry.raw,
@ -136,7 +136,7 @@ class TextEmbeddings(ABC):
corpus_id=entry.corpus_id,
)
)
new_embeddings = Embeddings.objects.bulk_create(batch_embeddings_to_create)
new_embeddings = DbEntry.objects.bulk_create(batch_embeddings_to_create)
logger.debug(f"Created {len(new_embeddings)} new embeddings")
num_new_embeddings += len(new_embeddings)
@ -146,26 +146,26 @@ class TextEmbeddings(ABC):
dates = self.date_filter.extract_dates(embedding.raw)
for date in dates:
dates_to_create.append(
EmbeddingsDates(
EntryDates(
date=date,
embeddings=embedding,
)
)
new_dates = EmbeddingsDates.objects.bulk_create(dates_to_create)
new_dates = EntryDates.objects.bulk_create(dates_to_create)
if len(new_dates) > 0:
logger.debug(f"Created {len(new_dates)} new date entries")
with timer("Identify hashes for removed entries", logger):
for file in hashes_by_file:
existing_entry_hashes = EmbeddingsAdapters.get_existing_entry_hashes_by_file(user, file)
existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file)
to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
num_deleted_embeddings += len(to_delete_entry_hashes)
EmbeddingsAdapters.delete_embedding_by_hash(user, hashed_values=list(to_delete_entry_hashes))
EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes))
with timer("Identify hashes for deleting entries", logger):
if deletion_filenames is not None:
for file_path in deletion_filenames:
deleted_count = EmbeddingsAdapters.delete_embedding_by_file(user, file_path)
deleted_count = EntryAdapters.delete_entry_by_file(user, file_path)
num_deleted_embeddings += deleted_count
return num_new_embeddings, num_deleted_embeddings
@ -180,11 +180,11 @@ class TextEmbeddings(ABC):
):
# Hash all current and previous entries to identify new entries
with timer("Hash previous, current entries", logger):
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries))
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
if deletion_filenames is not None:
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries))
deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
else:
deletion_entry_hashes = []

View file

@ -48,7 +48,7 @@ from khoj.processor.conversation.gpt4all.chat_model import extract_questions_off
from fastapi.requests import Request
from database import adapters
from database.adapters import EmbeddingsAdapters, ConversationAdapters
from database.adapters import EntryAdapters, ConversationAdapters
from database.models import LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig, KhojUser
@ -129,7 +129,7 @@ if not state.demo:
@requires(["authenticated"])
def get_config_data(request: Request):
user = request.user.object
EmbeddingsAdapters.get_unique_file_types(user)
EntryAdapters.get_unique_file_types(user)
return state.config
@ -145,7 +145,7 @@ if not state.demo:
configuration_update_metadata = {}
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
if state.config.content_type is not None:
configuration_update_metadata["github"] = "github" in enabled_content
@ -241,9 +241,9 @@ if not state.demo:
raise ValueError(f"Invalid content type: {content_type}")
await content_object.objects.filter(user=user).adelete()
await sync_to_async(EmbeddingsAdapters.delete_all_embeddings)(user, content_type)
await sync_to_async(EntryAdapters.delete_all_entries)(user, content_type)
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
return {"status": "ok"}
@api.post("/delete/config/data/processor/conversation/openai", status_code=200)
@ -372,7 +372,7 @@ def get_config_types(
):
user = request.user.object
enabled_file_types = EmbeddingsAdapters.get_unique_file_types(user)
enabled_file_types = EntryAdapters.get_unique_file_types(user)
configured_content_types = list(enabled_file_types)
@ -706,7 +706,7 @@ async def extract_references_and_questions(
if conversation_type == ConversationCommand.General:
return compiled_references, inferred_queries, q
if not await EmbeddingsAdapters.user_has_embeddings(user=user):
if not await EntryAdapters.user_has_entries(user=user):
logger.warning(
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
)

View file

@ -10,12 +10,12 @@ from starlette.authentication import requires
# Internal Packages
from khoj.utils import state, constants
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
from khoj.processor.github.github_to_entries import GithubToEntries
from khoj.processor.notion.notion_to_entries import NotionToEntries
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.search_type import text_search, image_search
from khoj.routers.helpers import update_telemetry_state
from khoj.utils.yaml import save_config_to_file_updated_state
@ -201,7 +201,7 @@ def configure_content(
logger.info("🦄 Setting up search for orgmode notes")
# Extract Entries, Generate Notes Embeddings
text_search.setup(
OrgToJsonl,
OrgToEntries,
files.get("org"),
regenerate=regenerate,
full_corpus=full_corpus,
@ -216,7 +216,7 @@ def configure_content(
logger.info("💎 Setting up search for markdown notes")
# Extract Entries, Generate Markdown Embeddings
text_search.setup(
MarkdownToJsonl,
MarkdownToEntries,
files.get("markdown"),
regenerate=regenerate,
full_corpus=full_corpus,
@ -232,7 +232,7 @@ def configure_content(
logger.info("🖨️ Setting up search for pdf")
# Extract Entries, Generate PDF Embeddings
text_search.setup(
PdfToJsonl,
PdfToEntries,
files.get("pdf"),
regenerate=regenerate,
full_corpus=full_corpus,
@ -248,7 +248,7 @@ def configure_content(
logger.info("📄 Setting up search for plaintext")
# Extract Entries, Generate Plaintext Embeddings
text_search.setup(
PlaintextToJsonl,
PlaintextToEntries,
files.get("plaintext"),
regenerate=regenerate,
full_corpus=full_corpus,
@ -281,7 +281,7 @@ def configure_content(
logger.info("🐙 Setting up search for github")
# Extract Entries, Generate Github Embeddings
text_search.setup(
GithubToJsonl,
GithubToEntries,
None,
regenerate=regenerate,
full_corpus=full_corpus,
@ -298,7 +298,7 @@ def configure_content(
if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
logger.info("🔌 Setting up search for notion")
text_search.setup(
NotionToJsonl,
NotionToEntries,
None,
regenerate=regenerate,
full_corpus=full_corpus,

View file

@ -19,7 +19,7 @@ from khoj.utils.rawconfig import (
# Internal Packages
from khoj.utils import constants, state
from database.adapters import EmbeddingsAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
from database.adapters import EntryAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
from database.models import LocalOrgConfig, LocalMarkdownConfig, LocalPdfConfig, LocalPlaintextConfig
@ -34,19 +34,52 @@ VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"]
@web_client.get("/", response_class=FileResponse)
@requires(["authenticated"], redirect="login_page")
def index(request: Request):
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
return templates.TemplateResponse(
"index.html",
context={
"request": request,
"demo": state.demo,
"username": user.username,
"user_photo": user_picture,
},
)
@web_client.post("/", response_class=FileResponse)
@requires(["authenticated"], redirect="login_page")
def index_post(request: Request):
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
return templates.TemplateResponse(
"index.html",
context={
"request": request,
"demo": state.demo,
"username": user.username,
"user_photo": user_picture,
},
)
@web_client.get("/chat", response_class=FileResponse)
@requires(["authenticated"], redirect="login_page")
def chat_page(request: Request):
return templates.TemplateResponse("chat.html", context={"request": request, "demo": state.demo})
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
return templates.TemplateResponse(
"chat.html",
context={
"request": request,
"demo": state.demo,
"username": user.username,
"user_photo": user_picture,
},
)
@web_client.get("/login", response_class=FileResponse)
@ -84,7 +117,8 @@ if not state.demo:
@requires(["authenticated"], redirect="login_page")
def config_page(request: Request):
user = request.user.object
enabled_content = set(EmbeddingsAdapters.get_unique_file_types(user).all())
user_picture = request.session.get("user", {}).get("picture")
enabled_content = set(EntryAdapters.get_unique_file_types(user).all())
default_full_config = FullConfig(
content_type=None,
search_type=None,
@ -128,7 +162,8 @@ if not state.demo:
"current_config": current_config,
"current_model_state": successfully_configured,
"anonymous_mode": state.anonymous_mode,
"username": user.username if user else None,
"username": user.username,
"user_photo": user_picture,
},
)
@ -136,6 +171,7 @@ if not state.demo:
@requires(["authenticated"], redirect="login_page")
def github_config_page(request: Request):
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
current_github_config = get_user_github_config(user)
if current_github_config:
@ -158,13 +194,20 @@ if not state.demo:
current_config = {} # type: ignore
return templates.TemplateResponse(
"content_type_github_input.html", context={"request": request, "current_config": current_config}
"content_type_github_input.html",
context={
"request": request,
"current_config": current_config,
"username": user.username,
"user_photo": user_picture,
},
)
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
@requires(["authenticated"], redirect="login_page")
def notion_config_page(request: Request):
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
current_notion_config = get_user_notion_config(user)
current_config = NotionContentConfig(
@ -174,7 +217,13 @@ if not state.demo:
current_config = json.loads(current_config.json())
return templates.TemplateResponse(
"content_type_notion_input.html", context={"request": request, "current_config": current_config}
"content_type_notion_input.html",
context={
"request": request,
"current_config": current_config,
"username": user.username,
"user_photo": user_picture,
},
)
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
@ -185,6 +234,7 @@ if not state.demo:
object = map_config_to_object(content_type)
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
config = object.objects.filter(user=user).first()
if config == None:
config = object.objects.create(user=user)
@ -202,6 +252,8 @@ if not state.demo:
"request": request,
"current_config": current_config,
"content_type": content_type,
"username": user.username,
"user_photo": user_picture,
},
)
@ -209,6 +261,7 @@ if not state.demo:
@requires(["authenticated"], redirect="login_page")
def conversation_processor_config_page(request: Request):
user = request.user.object
user_picture = request.session.get("user", {}).get("picture")
openai_config = ConversationAdapters.get_openai_conversation_config(user)
if openai_config:
@ -229,5 +282,7 @@ if not state.demo:
context={
"request": request,
"current_config": current_processor_openai_config,
"username": user.username,
"user_photo": user_picture,
},
)

View file

@ -6,31 +6,31 @@ from typing import List, Tuple, Type, Union, Dict
# External Packages
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sentence_transformers import util
from asgiref.sync import sync_to_async
# Internal Packages
from khoj.utils import state
from khoj.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, timer
from khoj.utils.helpers import get_absolute_path, timer
from khoj.utils.models import BaseEncoder
from khoj.utils.state import SearchType
from khoj.utils.rawconfig import SearchResponse, Entry
from khoj.utils.jsonl import load_jsonl
from khoj.processor.text_to_jsonl import TextEmbeddings
from database.adapters import EmbeddingsAdapters
from database.models import KhojUser, Embeddings
from khoj.processor.text_to_entries import TextToEntries
from database.adapters import EntryAdapters
from database.models import KhojUser, Entry as DbEntry
logger = logging.getLogger(__name__)
search_type_to_embeddings_type = {
SearchType.Org.value: Embeddings.EmbeddingsType.ORG,
SearchType.Markdown.value: Embeddings.EmbeddingsType.MARKDOWN,
SearchType.Plaintext.value: Embeddings.EmbeddingsType.PLAINTEXT,
SearchType.Pdf.value: Embeddings.EmbeddingsType.PDF,
SearchType.Github.value: Embeddings.EmbeddingsType.GITHUB,
SearchType.Notion.value: Embeddings.EmbeddingsType.NOTION,
SearchType.Org.value: DbEntry.EntryType.ORG,
SearchType.Markdown.value: DbEntry.EntryType.MARKDOWN,
SearchType.Plaintext.value: DbEntry.EntryType.PLAINTEXT,
SearchType.Pdf.value: DbEntry.EntryType.PDF,
SearchType.Github.value: DbEntry.EntryType.GITHUB,
SearchType.Notion.value: DbEntry.EntryType.NOTION,
SearchType.All.value: None,
}
@ -121,7 +121,7 @@ async def query(
# Find relevant entries for the query
top_k = 10
with timer("Search Time", logger, state.device):
hits = EmbeddingsAdapters.search_with_embeddings(
hits = EntryAdapters.search_with_embeddings(
user=user,
embeddings=question_embedding,
max_results=top_k,
@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
def setup(
text_to_jsonl: Type[TextEmbeddings],
text_to_entries: Type[TextToEntries],
files: dict[str, str],
regenerate: bool,
full_corpus: bool = True,
@ -196,11 +196,11 @@ def setup(
config=None,
) -> None:
if config:
num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
)
else:
num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
)

View file

@ -13,7 +13,7 @@ app = FastAPI()
# Internal Packages
from khoj.configure import configure_routes, configure_search_types, configure_middleware
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.search_type import image_search, text_search
from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory
@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
)
from khoj.utils import state, fs_syncer
from khoj.routers.indexer import configure_content
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from database.models import (
KhojApiUser,
LocalOrgConfig,
@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
user=default_user,
)
text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
if os.getenv("GITHUB_PAT_TOKEN"):
GithubConfig.objects.create(
@ -242,7 +242,7 @@ def client(
# These lines help us Mock the Search models for these search types
state.search_models.image_search = image_search.initialize_model(search_config.image)
text_search.setup(
OrgToJsonl,
OrgToEntries,
get_sample_data("org"),
regenerate=False,
user=api_user.user,
@ -251,7 +251,7 @@ def client(
content_config.image, state.search_models.image_search, regenerate=False
)
text_search.setup(
PlaintextToJsonl,
PlaintextToEntries,
get_sample_data("plaintext"),
regenerate=False,
user=api_user.user,

View file

@ -15,9 +15,9 @@ from khoj.utils import state
from khoj.utils.state import search_models, content_index, config
from khoj.search_type import text_search, image_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from database.models import KhojUser
from database.adapters import EmbeddingsAdapters
from database.adapters import EntryAdapters
# Test
@ -176,9 +176,9 @@ def test_regenerate_with_github_fails_without_pat(client):
@pytest.mark.skip(reason="Flaky test on parallel test runs")
def test_get_configured_types_via_api(client, sample_org_data):
# Act
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False)
text_search.setup(OrgToEntries, sample_org_data, regenerate=False)
enabled_types = EmbeddingsAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
# Assert
assert list(enabled_types) == ["org"]
@ -189,7 +189,7 @@ def test_get_configured_types_via_api(client, sample_org_data):
def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
# Act
response = client.get(f"/api/config/types", headers=headers)
@ -255,7 +255,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
user_query = quote("How to git install application?")
# Act
@ -276,7 +276,7 @@ def test_notes_search_with_only_filters(
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(
OrgToJsonl,
OrgToEntries,
sample_org_data,
regenerate=False,
user=default_user,
@ -298,7 +298,7 @@ def test_notes_search_with_only_filters(
def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
user_query = quote('How to git install application? +"Emacs"')
# Act
@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(
OrgToJsonl,
OrgToEntries,
sample_org_data,
regenerate=False,
user=default_user,
@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-token"} # Token for default_user2
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
user_query = quote("How to git install application?")
# Act

View file

@ -4,7 +4,7 @@ from pathlib import Path
import os
# Internal Packages
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
from khoj.utils.fs_syncer import get_markdown_files
from khoj.utils.rawconfig import TextContentConfig
@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Markdown files
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Markdown files
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Markdown files
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
# Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Act
# Extract Entries from specified Markdown files
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
# Assert
assert len(entries) == 2

View file

@ -3,8 +3,8 @@ import json
import os
# Internal Packages
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.text_to_jsonl import TextEmbeddings
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import is_none_or_empty
from khoj.utils.rawconfig import Entry
from khoj.utils.fs_syncer import get_org_files
@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
for index_heading_entries in [True, False]:
# Act
# Extract entries into jsonl from specified Org files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
OrgToJsonl.convert_org_nodes_to_entries(
*OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
OrgToEntries.convert_org_nodes_to_entries(
*OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
# Split each entry from specified Org files by max words
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
TextEmbeddings.split_entries_by_max_tokens(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
TextToEntries.split_entries_by_max_tokens(
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
# Act
# Split entry by max words and drop words larger than max word length
processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0]
processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
# Assert
# "Heading" dropped from compiled version because its over the set max word limit
@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Org files
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -136,11 +136,11 @@ Intro text
# Act
# Extract Entries from specified Org files
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
# Act
# Extract Entries from specified Org files
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
# Process Each Entry from All Notes Files
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Act
# Extract Entries from specified Org files
entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
entries, _ = OrgToEntries.extract_org_entries(org_files=data)
# Assert
assert len(entries) == 2

View file

@ -3,7 +3,7 @@ import json
import os
# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
from khoj.utils.fs_syncer import get_pdf_files
from khoj.utils.rawconfig import TextContentConfig
@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

View file

@ -6,7 +6,7 @@ from pathlib import Path
# Internal Packages
from khoj.utils.fs_syncer import get_plaintext_files
from khoj.utils.rawconfig import TextContentConfig
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from database.models import LocalPlaintextConfig, KhojUser
@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
f"{plaintextfile}": entry,
}
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)
# Convert each entry.file to absolute path to make them JSON serializable
for map in maps:
map.file = str(Path(map.file).absolute())
# Process Each Entry from All Notes Files
jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
extracted_plaintext_files = get_plaintext_files(config=config)
# Act
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)
# Assert
assert len(maps) == 1

View file

@ -1,6 +1,5 @@
# System Packages
import logging
import locale
from pathlib import Path
import os
import asyncio
@ -11,10 +10,10 @@ import pytest
# Internal Packages
from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from khoj.processor.github.github_to_entries import GithubToEntries
from khoj.utils.fs_syncer import collect_files, get_org_files
from database.models import LocalOrgConfig, KhojUser, Embeddings, GithubConfig
from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig
logger = logging.getLogger(__name__)
@ -66,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
verify_embeddings(0, default_user)
@ -81,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
with caplog.at_level(logging.DEBUG):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert
assert "Deleting all embeddings for file type org" in caplog.text
@ -95,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
with caplog.at_level(logging.DEBUG):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert
assert "Created 4 new embeddings" in caplog.text
@ -113,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
# Act
# Generate initial notes embeddings during asymmetric setup
with caplog.at_level(logging.DEBUG):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
initial_logs = caplog.text
caplog.clear() # Clear logs
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
with caplog.at_level(logging.DEBUG):
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
final_logs = caplog.text
# Assert
@ -149,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
await loop.run_in_executor(
None,
text_search.setup,
OrgToJsonl,
OrgToEntries,
data,
True,
True,
@ -186,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
# Act
# reload embeddings, entries, notes model after adding new org-mode file
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
# Assert
# verify newly added org-mode entry is split by max tokens
@ -219,7 +218,7 @@ conda activate khoj
#+end_src"""
}
text_search.setup(
OrgToJsonl,
OrgToEntries,
data,
regenerate=False,
user=default_user,
@ -238,7 +237,7 @@ conda activate khoj
# reload embeddings, entries, notes model after adding new org-mode file
with caplog.at_level(logging.INFO):
text_search.setup(
OrgToJsonl,
OrgToEntries,
data,
regenerate=False,
full_corpus=False,
@ -260,7 +259,7 @@ def test_regenerate_index_with_new_entry(
data = get_org_files(org_config)
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
@ -274,7 +273,7 @@ def test_regenerate_index_with_new_entry(
# Act
# regenerate notes jsonl, model embeddings and model to include entry from new file
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert
assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
@ -299,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
# Act
# generate embeddings, entries, notes model from scratch after adding new org-mode file
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
initial_logs = caplog.text
caplog.clear() # Clear logs
@ -307,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
# update embeddings, entries, notes model with no new changes
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
final_logs = caplog.text
# Assert
@ -332,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
# load embeddings, entries, notes model after adding new org file with 2 entries
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
initial_logs = caplog.text
caplog.clear() # Clear logs
@ -344,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
# Act
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
final_logs = caplog.text
# Assert
@ -362,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
initial_logs = caplog.text
caplog.clear() # Clear logs
@ -376,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
# Act
# update embeddings, entries with the newly added note
with caplog.at_level(logging.INFO):
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
final_logs = caplog.text
# Assert
@ -394,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
# Act
# Regenerate github embeddings to test asymmetric setup without caching
text_search.setup(
GithubToJsonl,
GithubToEntries,
{},
regenerate=True,
user=default_user,
@ -402,10 +401,10 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
)
# Assert
embeddings = Embeddings.objects.filter(user=default_user, file_type="github").count()
embeddings = Entry.objects.filter(user=default_user, file_type="github").count()
assert embeddings > 1
def verify_embeddings(expected_count, user):
embeddings = Embeddings.objects.filter(user=user, file_type="org").count()
embeddings = Entry.objects.filter(user=user, file_type="org").count()
assert embeddings == expected_count