"""Chinese full-text search using Xapian for indexing and jieba for segmentation.

Setup
-----
1. Install the full-text search library::

       pip install xapian-bindings-binary -i https://pypi.tuna.tsinghua.edu.cn/simple

   Documentation: https://xapian.org/docs/bindings/python3/

2. Install the word-segmentation library::

       pip install jieba

3. Build the index with :func:`create_index`.

   Note that Xapian raises an error for a term that is too long (> 245),
   so every term is clipped to ``MAX_TERM_BYTES`` before it is indexed.

4. Query the index with :func:`search_index2`.
"""

import jieba
import xapian

# Default on-disk location of the Xapian database.
DB_PATH = "xapian.data"

# Xapian rejects terms longer than 245; the limit is measured on the encoded
# term, so we clip on UTF-8 bytes (not characters) and keep a small margin.
MAX_TERM_BYTES = 240

# NOTE(review): `bad_chars` is referenced below but is not defined in this
# snippet. It is presumably a collection of tokens (punctuation, whitespace,
# stop words) to drop after segmentation -- define or import it before
# calling the functions in this module.


def _clip_term(term, max_bytes=MAX_TERM_BYTES):
    """Return *term* shortened so its UTF-8 encoding fits in *max_bytes*."""
    raw = term.encode("utf-8")
    if len(raw) <= max_bytes:
        return term
    # errors="ignore" drops a multi-byte character that was cut in half.
    return raw[:max_bytes].decode("utf-8", errors="ignore")


def _tokenize(text):
    """Segment *text* with jieba and return the unique tokens not in bad_chars."""
    return list(set(jieba.cut_for_search(text)) - set(bad_chars))


def create_index(words, db_path=DB_PATH):
    """Index every string in *words* as one Xapian document.

    Each document stores the original string as its data, the jieba tokens
    as its terms, and the label ``文档<i>`` (``i`` being the position of the
    string in *words*) in value slot 0.

    Args:
        words: Iterable of strings to index.
        db_path: Path of the Xapian database; created if it does not exist.
    """
    index_db = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    try:
        for i, data in enumerate(words):
            tokens = _tokenize(data)
            print(tokens)

            doc = xapian.Document()
            doc.set_data(data)
            for token in tokens:
                term = _clip_term(token)
                if term:  # Xapian does not accept an empty term name.
                    doc.add_term(term)
            doc.add_value(0, f"文档{i}")
            index_db.add_document(doc)
        index_db.commit()
    finally:
        # Release the write lock even if indexing failed part-way through.
        index_db.close()


def search_index2(keyword, db_path=DB_PATH, limit=50):
    """Search the index through Xapian's QueryParser and print the matches.

    The keyword is first segmented with jieba; the resulting tokens are
    joined with spaces and handed to a QueryParser configured with the
    English stemmer, so query syntax is honoured.

    Prints the parsed query, the number of matches, and one line per match
    with the value-slot-0 label, rank, percent score, docid and stored data.

    Args:
        keyword: Raw query text.
        db_path: Path of the Xapian database to search.
        limit: Maximum number of matches to retrieve.
    """
    # Clip query tokens exactly like index terms so over-long terms still match.
    tokens = [_clip_term(token) for token in _tokenize(keyword)]
    keyword = " ".join(tokens)

    db = xapian.Database(db_path)
    try:
        enquire = xapian.Enquire(db)

        parser = xapian.QueryParser()
        parser.set_stemmer(xapian.Stem("english"))
        parser.set_database(db)
        # NOTE(review): create_index() stores terms verbatim (no stemming or
        # lowercasing) while STEM_ALL stems every query term, so English
        # words may fail to match -- confirm whether STEM_SOME / STEM_NONE
        # or stemming at index time is what is wanted.
        parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)

        query = parser.parse_query(keyword)
        print("Parsed query is: %s" % str(query))

        enquire.set_query(query)
        matches = enquire.get_mset(0, limit)
        print(matches.size())
        for m in matches:
            print("%s %i %i%% docid=%i [%s]" % (
                m.document.get_value(0).decode("utf-8"),
                m.rank + 1,
                m.percent,
                m.docid,
                m.document.get_data().decode("utf-8"),
            ))
    finally:
        db.close()