From 83afd253b30fd9699970f9371bbeccb81f965114 Mon Sep 17 00:00:00 2001 From: 13315423919 <13315423919@qq.com> Date: Wed, 19 Nov 2025 19:43:19 +0800 Subject: [PATCH] Add File --- doc/全文检索笔记.txt | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 doc/全文检索笔记.txt diff --git a/doc/全文检索笔记.txt b/doc/全文检索笔记.txt new file mode 100644 index 0000000..8f61658 --- /dev/null +++ b/doc/全文检索笔记.txt @@ -0,0 +1,62 @@ +一、安装全文检索库 + pip install xapian-bindings-binary -i https://pypi.tuna.tsinghua.edu.cn/simple + + 文档: + https://xapian.org/docs/bindings/python3/ + +二、安装分词库 + pip install jieba + +三、建立索引 +def create_index(words): + + index_db = xapian.WritableDatabase("xapian.data",xapian.DB_CREATE_OR_OPEN) + + for i,data in enumerate(words): + seg_list = jieba.cut_for_search(data) + seg_list_clean = list(set(seg_list).difference(set(bad_chars))) + print(seg_list_clean) + + doc = xapian.Document() + doc.set_data(data) + for seg in seg_list_clean: + #print seg + doc.add_term(seg) + doc.add_value(0,f"文档{i}") + index_db.add_document(doc) + index_db.flush() + index_db.close() + +同时注意:Term 经 UTF-8 编码后超过 245 字节将引发 "Term too long (> 245)" 错误;该限制按字节而非字符计算(一个汉字通常占 3 字节),因此 seg = seg[0:240] 这种按字符截断对中文仍可能超限,应在 add_term 之前按字节截断,例如 seg = seg.encode("utf-8")[:240].decode("utf-8", "ignore") + +四、查询 +#带语法的查询,英文的,使用结巴分词对语义做了处理 +def search_index2(keyword): + + #分词 + seg_list = jieba.cut_for_search(keyword) + seg_list_clean = list(set(seg_list).difference(set(bad_chars))) + keyword = " ".join(seg_list_clean) + + #打开数据库 + db = xapian.Database("xapian.data") + enquire =xapian.Enquire(db) + + #查询 + qp = xapian.QueryParser() + stemmer = xapian.Stem("english") + qp.set_stemmer(stemmer) + qp.set_database(db) + #qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) + qp.set_stemming_strategy(xapian.QueryParser.STEM_ALL) + query = qp.parse_query(keyword) + + #query =xapian.Query(keyword) + print("Parsed query is: %s" % str(query)) + + enquire.set_query(query) + matches = enquire.get_mset(0,50) + print(matches.size()) + for m in matches: + print("%s %i %i%% docid=%i [%s]" % 
(m.document.get_value(0).decode("utf-8"),m.rank+1,m.percent,m.docid,m.document.get_data().decode("utf-8"))) +