Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 18 additions & 19 deletions apps/knowledge/sql/blend_search.sql
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
WITH vector_top AS (
SELECT id,
paragraph_id,
(embedding::vector(%s) <=> %s) AS distance
FROM embedding ${embedding_query}
ORDER BY (embedding::vector(%s) <=> %s)
LIMIT LEAST(%s * 10, 500)
)
SELECT
paragraph_id,
comprehensive_score,
comprehensive_score AS similarity
FROM
(
SELECT DISTINCT ON
( "paragraph_id" ) ( 1 - distance + ts_similarity ) as similarity, *,
(1 - distance + ts_similarity) AS comprehensive_score
(vc.paragraph_id) vc.paragraph_id,
(1 - vc.distance + COALESCE(ts_rank_cd(e.search_vector, websearch_to_tsquery('simple', %s), 32), 0)) AS comprehensive_score
FROM
(
SELECT
*,
(embedding.embedding::vector(%s) <=> %s) as distance,
(ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity
FROM
embedding ${embedding_query}
ORDER BY distance
) TEMP
vector_top vc
JOIN embedding e ON e.id = vc.id
ORDER BY
paragraph_id,
similarity DESC
) DISTINCT_TEMP
WHERE
comprehensive_score >%s
ORDER BY
comprehensive_score DESC
LIMIT %s
vc.paragraph_id,
comprehensive_score DESC
) sub
WHERE comprehensive_score>%s
ORDER BY comprehensive_score DESC
LIMIT %s
20 changes: 14 additions & 6 deletions apps/knowledge/sql/embedding_search.sql
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
WITH vector_top AS (
SELECT paragraph_id,
(embedding::vector(%s) <=> %s) AS distance
FROM embedding ${embedding_query}
ORDER BY (embedding::vector(%s) <=> %s)
LIMIT LEAST(%s * 10, 500)
)
SELECT
paragraph_id,
comprehensive_score,
comprehensive_score as similarity
FROM
(
SELECT DISTINCT ON
("paragraph_id") ( 1 - distance ),* ,(1 - distance) AS comprehensive_score
(vc.paragraph_id) vc.paragraph_id,
(1 - vc.distance) AS comprehensive_score
FROM
( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distance FROM embedding ${embedding_query} ORDER BY distance) TEMP
vector_top vc
ORDER BY
paragraph_id,
distance
) DISTINCT_TEMP
vc.paragraph_id,
comprehensive_score DESC
) sub
WHERE comprehensive_score>%s
ORDER BY comprehensive_score DESC
LIMIT %s
LIMIT %s
5 changes: 3 additions & 2 deletions apps/knowledge/sql/keywords_search.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@ FROM
SELECT DISTINCT ON
("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score
FROM
( SELECT *,ts_rank_cd(embedding.search_vector,websearch_to_tsquery('simple',%s),32) AS similarity FROM embedding ${keywords_query}) TEMP
( SELECT *,ts_rank_cd(embedding.search_vector,websearch_to_tsquery('simple',%s),32) AS similarity FROM embedding ${keywords_query}
AND COALESCE(search_vector @@ websearch_to_tsquery('simple',%s), false)) TEMP
ORDER BY
paragraph_id,
similarity DESC
) DISTINCT_TEMP
WHERE comprehensive_score>%s
ORDER BY comprehensive_score DESC
LIMIT %s
LIMIT %s
69 changes: 50 additions & 19 deletions apps/knowledge/vector/pg_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,27 @@ def hit_test(
exclude_dict = {}
query_text = normalize_for_embedding(query_text)
embedding_query = embedding.embed_query(query_text)
query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=True)
if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:
exclude_dict.__setitem__("document_id__in", exclude_document_id_list)
query_set = query_set.exclude(**exclude_dict)
for search_handle in search_handle_list:
if search_handle.support(search_mode):
return search_handle.handle(
query_set, query_text, embedding_query, top_number, similarity, search_mode, knowledge_id_list
)
# Query per knowledge base to leverage per-KB partial HNSW indexes
# (WHERE knowledge_id = '{k_id}'), which won't be used with knowledge_id__in
if len(knowledge_id_list) == 1:
query_set = QuerySet(Embedding).filter(knowledge_id=knowledge_id_list[0], is_active=True).exclude(**exclude_dict)
return search_handle.handle(
query_set, query_text, embedding_query, top_number, similarity, search_mode, knowledge_id_list
)
else:
all_results = []
for kid in knowledge_id_list:
query_set = QuerySet(Embedding).filter(knowledge_id=kid, is_active=True).exclude(**exclude_dict)
results = search_handle.handle(
query_set, query_text, embedding_query, top_number, similarity, search_mode, knowledge_id_list
)
all_results.extend(results)
all_results.sort(key=lambda x: x.get("similarity", x.get("comprehensive_score", 0)), reverse=True)
return all_results[:top_number]

def query(
self,
Expand All @@ -149,19 +161,35 @@ def query(
exclude_dict = {}
if knowledge_id_list is None or len(knowledge_id_list) == 0:
return []
query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=is_active)
if document_id_list is not None and len(document_id_list) > 0:
query_set = query_set.filter(document_id__in=document_id_list)
if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:
query_set = query_set.exclude(document_id__in=exclude_document_id_list)
if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0:
query_set = query_set.exclude(paragraph_id__in=exclude_paragraph_list)
query_set = query_set.exclude(**exclude_dict)
for search_handle in search_handle_list:
if search_handle.support(search_mode):
return search_handle.handle(
query_set, query_text, query_embedding, top_n, similarity, search_mode, knowledge_id_list
)
# Query per knowledge base to leverage per-KB partial HNSW indexes
# (WHERE knowledge_id = '{k_id}'), which won't be used with knowledge_id__in
def build_query_set(kid):
qs = QuerySet(Embedding).filter(knowledge_id=kid, is_active=is_active)
if document_id_list is not None and len(document_id_list) > 0:
qs = qs.filter(document_id__in=document_id_list)
if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:
qs = qs.exclude(document_id__in=exclude_document_id_list)
if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0:
qs = qs.exclude(paragraph_id__in=exclude_paragraph_list)
qs = qs.exclude(**exclude_dict)
return qs
if len(knowledge_id_list) == 1:
query_set = build_query_set(knowledge_id_list[0])
return search_handle.handle(
query_set, query_text, query_embedding, top_n, similarity, search_mode, knowledge_id_list
)
else:
all_results = []
for kid in knowledge_id_list:
query_set = build_query_set(kid)
results = search_handle.handle(
query_set, query_text, query_embedding, top_n, similarity, search_mode, knowledge_id_list
)
all_results.extend(results)
all_results.sort(key=lambda x: x.get("similarity", x.get("comprehensive_score", 0)), reverse=True)
return all_results[:top_n]

def update_by_source_id(self, source_id: str, instance: Dict):
QuerySet(Embedding).filter(source_id=source_id).update(**instance)
Expand Down Expand Up @@ -236,7 +264,7 @@ def handle(
with_table_name=True,
)
embedding_model = select_list(
exec_sql, [len(query_embedding), json.dumps(query_embedding), *exec_params, similarity, top_number]
exec_sql, [len(query_embedding), json.dumps(query_embedding), *exec_params, len(query_embedding), json.dumps(query_embedding), top_number, similarity, top_number]
)
return embedding_model

Expand Down Expand Up @@ -268,7 +296,7 @@ def handle(
else None
)
embedding_model = select_list(
exec_sql, [to_query(query_text, user_words=terms), *exec_params, similarity, top_number]
exec_sql, [to_query(query_text, user_words=terms), *exec_params, to_query(query_text, user_words=terms), similarity, top_number]
)
return embedding_model

Expand Down Expand Up @@ -302,8 +330,11 @@ def handle(
[
len(query_embedding),
json.dumps(query_embedding),
to_query(query_text, user_words=terms),
*exec_params,
len(query_embedding),
json.dumps(query_embedding),
top_number,
to_query(query_text, user_words=terms),
similarity,
top_number,
],
Expand Down
Loading