# k3GPT/main/full_index_search.py
import xapian
import jieba
from xapian import Weight,BoolWeight,TradWeight,BM25Weight,BM25PlusWeight
from datetime import datetime
import time
from tika import parser
import json
# Initialization info (provides gcfg and other globals)
from init import *
from k_database import Doc
from operator import itemgetter
from second_search import cnt_keywords,retrieve_result
bad_chars = [
'', '', '', '', '', '', '', '', '', '', '', '', '或者', '因为', '所以', '如果',
'不但', '并且', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '我们', '你们', '他们', '她们', '它们', '', '', '这些', '那些', '', '什么', '', '哪里', '',
'', '', '', '已经', '曾经', '总是', '', '哎呀', '', '', '', '哗啦', '咔嚓', '喵呜',"",
'',
'',' ',',','','','','"',"'","(",")","","",
"/","~","^","-",".","\r","\n","\t","NULL","null",'[',']','{','}',
"\r\n",":"," ",'',
]  # "Bad" characters/words to filter out of jieba segmentation results
def load_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    stopwords.extend(bad_chars)
    return set(stopwords)
# Load the stop words
stopwords_file = 'stopwords.txt'  # Replace with the path to your stop-word file
stopwords = load_stopwords(stopwords_file)
def_words = ["1月","2月","3月","4月","5月","6月","7月","8月","9月","10月","11月","12月"]
for word in def_words:
    jieba.add_word(word)
#jieba.load_userdict("vocab.txt")
# Tokenize with jieba and remove stop words while preserving the original token order
def jieba_fenci(data):
    # Segment the text
    seg_list = jieba.lcut(data)
    seg_list_clean = list(set(seg_list).difference(set(stopwords)))
    # Keep the original order
    seg_list = [item for item in seg_list if item in seg_list_clean]
    return seg_list
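# Illustrative usage sketch (not part of the original module); the exact token list
# depends on jieba's dictionary and the loaded stop-word file:
#   tokens = jieba_fenci("3月的交付成本报价")
#   print(tokens)  # e.g. ['3月', '交付', '成本', '报价'] -- actual output may differ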
def open_db():
    # Retry a few times in case the writable database is temporarily locked
    index_db = None
    for i in range(3):
        try:
            index_db = xapian.WritableDatabase(f"{gcfg['fs']['path']}/xapian.data", xapian.DB_CREATE_OR_OPEN)
            break
        except Exception:
            time.sleep(1)
    return index_db
def open_db_readonly():
    db = xapian.Database(f"{gcfg['fs']['path']}/xapian.data")
    return db
# Custom weighting scheme that scores a document by the number of matched query terms
class KeywordCountWeight(BM25Weight):
    def __init__(self, query):
        super().__init__()
        self.query = query
        print(query)
        # Collect all query terms into a set
        self.query_terms = set()
        for term in self.query:
            self.query_terms.add(term)
    def get_sump(self, doclen, rsv):
        print("ss")
        # Default behaviour; extra logic can be added here
        return super().get_sump(doclen, rsv)
    def get_sumextra(self, wdf, rsv, ss):
        print("ss")
    def get_sumq(self, wdf, rsv):
        print(rsv)
        # Default behaviour; extra logic can be added here
        return super().get_sumq(wdf, rsv)
    def get_maxweight(self):
        print("ss")
        # Default behaviour; extra logic can be added here
        return super().get_maxweight()
    def get_weight(self, docid, wdf, rsv):
        # Get the document object
        doc = rsv.get_document()
        # Count how many query terms this document matches
        matched_terms_count = 0
        for term in doc.termlist():
            if term.term in self.query_terms:
                matched_terms_count += 1
        print(doc, matched_terms_count)
        # The base weight could be tuned; here the matched-term count is used directly
        return matched_terms_count
# Build the full-text index from document chunks; chunk size is controlled by the caller
def create_full_index(words, file_name, file_path, timestamp, source="知识库"):
    index_db = open_db()
    for i, data in enumerate(words):
        seg_list = jieba_fenci(data)
        doc = xapian.Document()
        doc.set_data(data)
        for seg in seg_list:
            seg = seg[0:240]
            doc.add_term(seg)
        doc.add_value(0, f"{file_name}")
        doc.add_value(1, f"{file_path}")
        doc.add_value(2, str(timestamp))
        doc.add_value(3, source)
        index_db.add_document(doc)
    index_db.flush()
    index_db.close()
# Create or update the full-text index for a knowledge-wiki entry; no chunking
def c_or_u_baike_index(data, title, baike_id, catalog):
    seg_list = jieba_fenci(f"知识百科:{catalog}/{title}\n{data}")
    # Delete any previous full-text index document for this wiki entry
    try:
        delete_fs_index_by_base_path("知识百科", baike_id)
    except Exception:
        pass
    # Create the new index document
    index_db = open_db()
    doc = xapian.Document()
    doc.set_data(f"知识百科:{catalog}/{title}\n{data}")
    for seg in seg_list:
        seg = seg[0:240]
        doc.add_term(seg)
    doc.add_value(0, f"{title}")
    doc.add_value(1, f"{baike_id}")
    doc.add_value(2, str(datetime.now().timestamp()))
    doc.add_value(3, "知识百科")  # Source: knowledge wiki
    doc.add_value(4, catalog)  # Category
    doc_id = index_db.add_document(doc)
    index_db.flush()
    index_db.close()
    return doc_id
# Delete a single chunk from the full-text index
def delete_index_by_docid(full_id):
    index_db = open_db()
    # Delete the document
    try:
        if full_id != 0:
            index_db.delete_document(full_id)
    except Exception:
        pass
    index_db.flush()
    index_db.close()
# Delete full-text index chunks for a list of docids
def delete_index_by_doc_list(doc_list):
    index_db = open_db()
    # Delete the documents
    try:
        for full_id in doc_list:
            index_db.delete_document(full_id)
    except Exception:
        pass
    index_db.flush()
    index_db.close()
# Delete every docid belonging to one document in the document center; `name` is the source or category
def delete_fs_index_by_base_path(name, path):
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    query = xapian.Query("")
    enquire.set_query(query)
    mdecider = base_path_matchdecider(f"{name}_{path}")
    matches = enquire.get_mset(0, 10000, None, mdecider)
    ids = [m.docid for m in matches]
    delete_index_by_doc_list(ids)
    return ids
# Create or update the full-text index for a document-center file, using fixed 20K chunks
def c_or_u_fs_index(file_name, file_path, st_size, owner="Admin"):
    try:
        raw = parser.from_file(file_path)
    except Exception as e:
        #raise Exception(f"Tika parsing failed: {e}")
        return 0
    data = raw['content']
    meta = raw['metadata']
    if data is None or data == "":
        return 0
        #raise Exception("The file content could not be recognized")
    try:
        Doc.create(base="文件中心", abs_path=file_path,
                   f_name=file_name,
                   f_size=st_size,
                   ctype=meta["Content-Type"],
                   catalog="文件中心",
                   author=owner,
                   meta=json.dumps(meta)[:1000]
                   )
    except Exception:
        # Duplicate file: delete the existing index first
        delete_fs_index_by_base_path("文件中心", file_path)
    # Build the index
    st = datetime.now()
    index_db = open_db()
    # Chunk the text ourselves
    words = split_text_by_n(data)
    for text in words:
        doc = xapian.Document()
        seg_list = jieba_fenci(f"文件:{file_path}\n{text}")
        doc.set_data(f"文件:{file_path}\n{text}")  # This prefix helps the LLM know where the content comes from
        for seg in seg_list:
            seg = seg[0:240]
            doc.add_term(seg)
        doc.add_value(0, f"{file_name}")
        doc.add_value(1, f"{file_path}")
        doc.add_value(2, str(st.timestamp()))
        doc.add_value(3, "文件中心")  # Source
        doc_id = index_db.add_document(doc)
    index_db.flush()
    index_db.close()
    return doc_id
# Custom text chunking: each cut backtracks to the last newline within the window
def split_text_by_n(text, n=20480):
    words = []
    if len(text) <= n:
        words.append(text)
        return words
    line_e = ""  # Leftover carried over from the previous chunk
    for i in range(0, len(text), n):
        line = line_e + text[i:i+n]
        ends = line.rfind("\n")
        if ends == -1:
            words.append(line)
            line_e = ""
        else:
            words.append(line[0:ends])
            line_e = line[ends:]
    if line_e != "":
        words.append(line_e)
    return words
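# Illustrative sketch (not part of the original module): a small n makes the behaviour
# visible; real files use the default n=20480. Each chunk is cut back to the last
# newline inside its window, and any leftover text becomes a final chunk.
#   chunks = split_text_by_n("第一行\n第二行\n第三行", n=8)
#   print(chunks)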
# Full-text index for a whole document: one article, one index document
def create_full_index2(data, file_name, file_path, timestamp):
    index_db = open_db()
    seg_list = jieba_fenci(data)
    doc = xapian.Document()
    doc.set_data(data)
    for seg in seg_list:
        seg = seg[0:240]
        doc.add_term(seg)
    doc.add_value(0, f"{file_name}")
    doc.add_value(1, f"{file_path}")
    doc.add_value(2, str(timestamp))
    index_db.add_document(doc)
    index_db.flush()
    index_db.close()
########################################## Knowledge chat #############################################
# Keep the total context size within the configured limit
def adjust_ctx_size(context0):
    sum_len = 0
    context = []
    for i, text in enumerate(context0):
        if not isinstance(text, str):
            text = str(text)
        if sum_len + len(text) < gcfg["llm"]["ctx_size"]:
            context.append(f"内容{i+1}:\n{text}")
            sum_len += len(text)
        else:
            context.append(f'内容{i+1}:\n{text[0:10]}...')
        # No break here: every chunk keeps an entry (possibly truncated), which makes it
        # easier to audit the results of this query upstream
        #break
    #print(context)
    return context
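# Illustrative sketch (not part of the original module): trimming a hypothetical list of
# retrieved chunks to the budget configured in gcfg["llm"]["ctx_size"]; entries that
# overflow are truncated to their first 10 characters plus "...".
#   context = adjust_ctx_size(["第一段内容……", "第二段内容……"])
#   print("\n".join(context))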
# Main entry point for knowledge chat / quick Q&A
def full_search(keyword, percent=50):
    r, m = full_search_logic(keyword, percent, [], xapian.Query.OP_AND_MAYBE)
    return r, m
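# Illustrative usage sketch (not part of the original module); the query string and
# threshold are hypothetical:
#   chunks, meta = full_search("数据分类分级", percent=50)
#   print(len(chunks), "chunks", meta)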
# Search with jieba tokens or a supplied keyword list; the query operator can be chosen,
# and a second-pass retrieval is applied to the matches
def full_search_logic(keyword, percent=60, seg_list=[], logic=xapian.Query.OP_AND):
    # Tokenize
    if len(seg_list) == 0:
        seg_list = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    # AND-style query
    query = xapian.Query(logic, seg_list)
    #query = xapian.Query(xapian.Query.OP_OR, seg_list)
    print(f"Parsed query is: {query}")
    enquire.set_query(query)
    # Set the weighting scheme
    #enquire.set_weighting_scheme(BM25Weight())
    enquire.set_weighting_scheme(BM25PlusWeight())
    #enquire.set_weighting_scheme(TradWeight())
    # Sort by relevance first, then by file date
    enquire.set_sort_by_relevance_then_value(2, True)
    #enquire.set_sort_by_value_then_relevance(2, True)
    matches = enquire.get_mset(0, 50)
    result, meta = retrieve_result(keyword, seg_list, matches, percent)
    print("Providing", len(result), "chunks", meta)
    if len(result) == 0 and logic == xapian.Query.OP_AND_MAYBE:
        result, meta = full_search_or(keyword, 30)
    return result, meta
# Fallback OR query, with jieba applied to the query text
def full_search_or(keyword, percent=30):
    # Tokenize
    seg_list = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    # OR query
    #query = xapian.Query(xapian.Query.OP_AND_MAYBE, seg_list)
    query = xapian.Query(xapian.Query.OP_OR, seg_list)
    print("Fallback search: %s" % str(query))
    enquire.set_query(query)
    # Set the weighting scheme
    enquire.set_weighting_scheme(BM25PlusWeight())
    # Sort by relevance first, then by file date
    enquire.set_sort_by_relevance_then_value(2, True)
    # Only take a few results
    matches = enquire.get_mset(0, 5)
    #for m in matches:
    #    print("%s %i %i%% docid=%i" % (m.document.get_value(0).decode("utf-8"), m.rank+1, m.percent, m.docid))
    result = [m.document.get_data().decode("utf-8") for m in matches if m.percent > percent]
    meta = [[m.document.get_value(0).decode("utf-8"), m.document.get_value(1).decode("utf-8"), m.document.get_value(3).decode("utf-8")] for m in matches if m.percent > percent]
    print("Found chunks:", len(meta), meta)
    return result, meta
# Query-parser search (English syntax), with jieba applied to the query text first
def full_search2(keyword, percent=60):
    # Tokenize
    seg_list_clean = jieba_fenci(keyword)
    keyword = " ".join(seg_list_clean)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    # Build the query
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(db)
    #qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    #qp.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
    query = qp.parse_query(keyword)
    #query = xapian.Query(keyword)
    print("Parsed query is: %s" % str(query))
    enquire.set_query(query)
    matches = enquire.get_mset(0, 20)
    result = [m.document.get_data().decode("utf-8") for m in matches if m.percent > percent]
    meta = [m.document.get_value(0).decode("utf-8") for m in matches if m.percent > percent]
    return result, meta
##----------------------------------------------------
# Matches a specific file in the knowledge base (source + path)
class base_path_matchdecider(xapian.MatchDecider):
    def __init__(self, base_path):
        xapian.MatchDecider.__init__(self)
        self.base_path = base_path
    def __call__(self, doc):
        # base + path, value slots 3 and 1
        doc_info = f'{doc.get_value(3).decode("utf-8")}_{doc.get_value(1).decode("utf-8")}'
        return doc_info == self.base_path
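# Illustrative sketch (not part of the original module): restricting an Enquire run to a
# single document identified as "<source>_<path>"; the path below is hypothetical.
#   db = open_db_readonly()
#   enquire = xapian.Enquire(db)
#   enquire.set_query(xapian.Query(""))  # match everything
#   decider = base_path_matchdecider("文件中心_/data/report.docx")
#   mset = enquire.get_mset(0, 50, None, decider)  # the decider filters candidate documents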
# Matches wiki entries in a given knowledge-wiki category
class baike_catalog_matchdecider(xapian.MatchDecider):
    def __init__(self, catalog):
        xapian.MatchDecider.__init__(self)
        self.catalog = catalog
    def __call__(self, doc):
        # Category slot
        catalog = doc.get_value(4).decode("utf-8")
        return catalog == self.catalog
# Search within a single document (identified by source + path); jieba is applied to the query text
def one_doc_search(keyword, base, file, path):
    # Tokenize
    seg_list_clean = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    query = xapian.Query(xapian.Query.OP_OR, seg_list_clean)
    #query = xapian.Query(xapian.Query.OP_OR, seg_list)
    print("Parsed query is: %s" % str(query))
    # Set the weighting scheme
    #enquire.set_weighting_scheme(BM25Weight())
    enquire.set_weighting_scheme(BM25PlusWeight())
    #enquire.set_weighting_scheme(TradWeight())
    enquire.set_query(query)
    mdecider = base_path_matchdecider(f"{base}_{path}")
    matches = enquire.get_mset(0, 20, None, mdecider)
    #matches = enquire.get_mset(0, 20)
    result = [m.document.get_data().decode("utf-8") for m in matches if m.percent > 30]
    #print(len(matches), result)
    if len(result) == 0:
        return one_doc_0(f"{base}_{path}")
    else:
        print(f"Found {len(result)} chunks")
        return result
# All chunks of a single file
def one_doc_0(base_path):
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    query = xapian.Query("")
    enquire.set_query(query)
    mdecider = base_path_matchdecider(base_path)
    matches = enquire.get_mset(0, 50, None, mdecider)
    result = [m.document.get_data().decode("utf-8") for m in matches if m.percent > 30]
    print(f"Fetched whole document {base_path}: {len(result)} chunks")
    return result
##----------------------------------------------------
# Helper functions for KAgent: match only documents in doc_list
##----------------------------------------------------
# Matches files contained in a document list
class doc_list_matchdecider(xapian.MatchDecider):
    def __init__(self, doc_list):
        xapian.MatchDecider.__init__(self)
        self.doc_list = doc_list
    def __call__(self, doc):
        if self.doc_list:
            # base + path
            doc_info = f'{doc.get_value(3).decode("utf-8")}_{doc.get_value(1).decode("utf-8")}'
            return doc_info in self.doc_list
        else:
            return True
# Search restricted to a list of documents
def full_search_by_doc_list(keyword, doc_list):
    # A single document: return its default content
    if len(doc_list) == 1:
        return one_doc_0(doc_list[0])
    # Tokenize
    seg_list_clean = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    query = xapian.Query(xapian.Query.OP_OR, seg_list_clean)
    #print("Parsed query is: %s" % str(query), f"doc list {doc_list}")
    print("Parsed query is: %s" % str(query))
    # Set the weighting scheme
    enquire.set_weighting_scheme(BM25PlusWeight())
    enquire.set_query(query)
    mdecider = doc_list_matchdecider(doc_list)
    matches = enquire.get_mset(0, 50, None, mdecider)
    # Keep matches whose confidence exceeds 30%
    result, meta = retrieve_result(keyword, seg_list_clean, matches, 30)
    print(f"Found {len(result)} chunks")
    return result
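# Illustrative sketch (not part of the original module); the entries follow the
# "<source>_<path>" format stored in value slots 3 and 1, and these paths are hypothetical:
#   docs = ["文件中心_/data/a.docx", "文件中心_/data/b.pdf"]
#   chunks = full_search_by_doc_list("交付 工时", docs)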
# Search within a knowledge-wiki category
def full_search_by_baike_catalog(keyword, catalog):
    # Tokenize
    seg_list_clean = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    query = xapian.Query(xapian.Query.OP_OR, seg_list_clean)
    print("Parsed query is: %s" % str(query))
    # Set the weighting scheme
    enquire.set_weighting_scheme(BM25PlusWeight())
    enquire.set_query(query)
    mdecider = baike_catalog_matchdecider(catalog)
    matches = enquire.get_mset(0, 1000, None, mdecider)
    result0 = [m.document.get_data().decode("utf-8") for m in matches if m.percent > 60]
    # De-duplicate the content
    r_hash = []
    result = []
    for r in result0:
        if hash(r) in r_hash:
            continue
        r_hash.append(hash(r))
        result.append(r)
    print(f"Found {len(result)} wiki entries")
    return result
##----------------------------------------------------
####################################################################################
# Main entry point for knowledge search; jieba is applied to the query text
def doc_search(keyword, percent, logic=xapian.Query.OP_AND):
    keyword = keyword.strip()
    # Tokenize
    seg_list = jieba_fenci(keyword)
    # Open the database (read-only)
    db = open_db_readonly()
    enquire = xapian.Enquire(db)
    # AND-style query
    query = xapian.Query(logic, seg_list)
    #query = xapian.Query(xapian.Query.OP_AND_MAYBE, seg_list)
    #query = xapian.Query(xapian.Query.OP_OR, seg_list)
    print("Parsed query is: %s" % str(query))
    enquire.set_query(query)
    # Set the weighting scheme
    #enquire.set_weighting_scheme(BM25Weight())
    enquire.set_weighting_scheme(BM25PlusWeight())
    #enquire.set_weighting_scheme(TradWeight())
    # Sort by relevance first, then by file date
    enquire.set_sort_by_relevance_then_value(2, True)
    #enquire.set_sort_by_value_then_relevance(2, True)
    matches = enquire.get_mset(0, 100)
    estimated_matches = matches.get_matches_estimated()
    result = {}
    r_cnt = {}
    for m in matches:
        if m.percent >= percent:
            r = {"id": m.docid,
                 "name": m.document.get_value(0).decode("utf-8"),
                 "path": m.document.get_value(1).decode("utf-8"),
                 "base": m.document.get_value(3).decode("utf-8"),
                 "fdate": datetime.fromtimestamp(float(m.document.get_value(2).decode("utf-8"))).strftime('%Y-%m-%d'),
                 "content": highlight_terms(m.document.get_data().decode("utf-8"), seg_list)}
            result[m.docid] = r
            r_cnt[m.docid] = cnt_keywords(r["path"], seg_list)
    # Sort by keyword count, descending
    sorted_cnt = sorted(r_cnt.items(), key=itemgetter(1), reverse=True)
    sorted_r_cnt = dict(sorted_cnt)
    r0 = []
    for doc_id, cnt in sorted_r_cnt.items():
        r0.append(result[doc_id])
    # Fallback: if nothing was found with AND, retry with an OR query
    if len(r0) == 0 and logic == xapian.Query.OP_AND:
        estimated_matches, r0 = doc_search(keyword, 60, xapian.Query.OP_OR)
        return len(r0), r0
    return estimated_matches, r0
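# Illustrative usage sketch (not part of the original module); the keyword and threshold
# are hypothetical:
#   total, hits = doc_search("数据分类分级", 50)
#   for hit in hits:
#       print(hit["name"], hit["base"], hit["fdate"])  # hit["content"] holds the highlighted snippet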
import re
from html import escape
def highlight_terms(text, terms):
    """
    Highlight every occurrence of the given terms in the text.
    :param text: the raw text of the document
    :param terms: collection of query terms
    :return: HTML string with the matched keywords highlighted
    """
    # Wrap every matched term in a highlight span
    for term in terms:
        if term in text:
            text = text.replace(term, f"<span style='background-color: yellow;'>{term}</span>")
    return scan_highlight(text)
# Collect the lines that contain highlight markers
def scan_highlight(text):
    result = []
    start = 0
    for i in range(20):  # Only the first 20 highlight markers
        b = text[start:].find("</span>")
        if b > 0:
            line_start = text[start:start+b].rfind("\n")
            if line_start == -1:
                line_start = start
            line_end = text[start+b:].find("\n")
            if line_end == -1:  # Not found: the sentence runs to the end of the text
                line = text[start+line_start:]
                result.append(line)
                break
            else:
                line = text[start+line_start:start+b+line_end]
                #print(start, line_start, line_end, line)
                result.append(line)
                start += b + line_end
    result = "\n".join(result)
    return result[0:400]
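# Illustrative sketch (not part of the original module); the text and term are hypothetical:
#   snippet = highlight_terms("成本报价需要人工工时\n其他内容", ["工时"])
#   print(snippet)  # lines containing the highlighted <span> markup, capped at 400 characters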
if __name__ == "__main__":
    words = [
        "大模型的幻觉问题由来已久,本质上还是其模型结构决定的。在数据分类分级场景中会涉及各行各业的数据、专业术语等在数据识别和分类分级时极易产生幻觉,甚至相同的数据不同的提示词或者多次的运行都会出现结果上的差异,如何降低这种幻觉达到工程应用的标准是迫切解决的问题。",
        "大模型的应用中更多的是依靠提示词来完成指定的动作和功能,但目前的大模型对提示词的遵从性上表现的并不好,特别是对于否定逻辑的语义理解上。如让大模型生成一个不戴眼镜的照片时,大模型的理解往往不够准确。如何提高大模型的指令的遵从性对于提高准确度非常关键。",
        "使用大模型来生成程序代码已经比较普遍了,如何利用这些模型来识别已有代码中是否存在安全漏洞,逻辑错误等问题,这种检测对于源代码安全、供应链安全存在重大意义。",
    ]
    req = "检索个人空间中关于交付功能的数据,增加一列人工工时,用于核算成本报价,最后将结果保存到个人空间中"
    print(jieba_fenci(req))