This commit is contained in:
2025-11-19 19:43:19 +08:00
parent ee7c7b6263
commit 83afd253b3

View File

@@ -0,0 +1,62 @@
一、安装全文检索库
pip install xapian-bindings-binary -i https://pypi.tuna.tsinghua.edu.cn/simple
文档:
https://xapian.org/docs/bindings/python3/
二、安装分词库
pip install jieba
三、建立索引
def create_index(words):
    """Index every string in *words* into the Xapian database "xapian.data".

    Each input string becomes one document: the raw string is stored as the
    document data, every distinct jieba search-mode token (minus anything in
    ``bad_chars``) is added as a term, and value slot 0 holds the label
    ``文档<i>`` where *i* is the position of the string in *words*.

    Args:
        words: Iterable of text strings to index.
    """
    # Xapian rejects terms longer than 245 bytes ("Term too long (> 245)"),
    # so terms are clipped to a safe byte length before being added.
    max_term_bytes = 240
    # Build the exclusion set once instead of once per document.
    excluded = set(bad_chars)

    index_db = xapian.WritableDatabase("xapian.data", xapian.DB_CREATE_OR_OPEN)
    try:
        for i, data in enumerate(words):
            seg_list = jieba.cut_for_search(data)
            seg_list_clean = list(set(seg_list).difference(excluded))
            print(seg_list_clean)

            doc = xapian.Document()
            doc.set_data(data)
            for seg in seg_list_clean:
                term = _clip_term(seg, max_term_bytes)
                # Skip empty terms; presumably Xapian refuses them -- confirm.
                if term:
                    doc.add_term(term)
            doc.add_value(0, f"文档{i}")
            index_db.add_document(doc)
        # commit() is the current name of the deprecated flush().
        index_db.commit()
    finally:
        # Always release the write lock, even if indexing failed part-way.
        index_db.close()


def _clip_term(term, max_bytes):
    """Return *term* cut to at most *max_bytes* UTF-8 bytes on a character boundary."""
    raw = term.encode("utf-8")
    if len(raw) <= max_bytes:
        return term
    # errors="ignore" drops a trailing multi-byte character split by the cut.
    return raw[:max_bytes].decode("utf-8", errors="ignore")
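注意:Xapian 的 term 长度超过 245 字节时会引发 "Term too long (> 245)" 错误,因此在调用 doc.add_term(seg) 之前应先截断过长的分词,例如 seg = seg[0:240]。该限制按 UTF-8 字节计算(一个中文字符通常占 3 个字节),对中文文本应按字节而不是按字符截断。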
四、查询
#带查询语法的检索:先用结巴分词切分关键词,再交给 QueryParser 解析(词干提取器为英文)
def search_index2(keyword):
    """Search the "xapian.data" index for *keyword* and print the matches.

    The keyword is segmented with jieba (search mode), tokens listed in
    ``bad_chars`` are dropped, and the remaining distinct tokens are joined
    with spaces and handed to a Xapian ``QueryParser`` configured with an
    English stemmer. The parsed query, the number of matches, and one line
    per match (value slot 0, rank, percent, docid, document data) are printed.

    Args:
        keyword: The user's search text.
    """
    # Segment the keyword with jieba and drop excluded tokens.
    seg_list = jieba.cut_for_search(keyword)
    seg_list_clean = list(set(seg_list).difference(set(bad_chars)))
    keyword = " ".join(seg_list_clean)

    db = xapian.Database("xapian.data")
    try:
        enquire = xapian.Enquire(db)

        qp = xapian.QueryParser()
        qp.set_stemmer(xapian.Stem("english"))
        qp.set_database(db)
        # NOTE(review): create_index stores raw, unstemmed jieba tokens, while
        # STEM_ALL presumably stems English query words -- confirm that stemmed
        # query terms still match the indexed terms.
        qp.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
        query = qp.parse_query(keyword)
        print("Parsed query is: %s" % str(query))

        enquire.set_query(query)
        # Fetch at most 50 results, starting from the first match.
        matches = enquire.get_mset(0, 50)
        print(matches.size())
        for m in matches:
            print(
                "%s %i %i%% docid=%i [%s]"
                % (
                    m.document.get_value(0).decode("utf-8"),
                    m.rank + 1,
                    m.percent,
                    m.docid,
                    m.document.get_data().decode("utf-8"),
                )
            )
    finally:
        # Release the database handle even if parsing or matching failed.
        db.close()