k3GPT/main/web_search.py

import requests
from lxml import html,etree
import re

#百度网页搜索
def baidu_web(keywords):
    # 百度搜索的URL模板
    url = "https://www.baidu.com/s"

    # 搜索关键词
    params = {
        "wd": keywords
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
    }

    response = requests.get(url, params=params, headers=headers)


    # 设置编码格式（自动检测）
    response.encoding = response.apparent_encoding

    pages=[]
    # 确保响应正常
    if response.status_code == 200:
        #print(response.content.decode("utf-8"))
        # 使用 lxml 解析 HTML
        tree = html.fromstring(response.content)

        # 使用 XPath 提取所有搜索结果标题和链接
        results = tree.xpath('//div[@id="content_left"]/div[contains(@class, "result")]')

        for result in results:
            title = result.xpath('.//h3/a//text()')
            link = result.xpath('.//h3/a/@href')

            title = ''.join(title).strip() if title else ''
            link = link[0] if link else ''
            

            # 使用 XPath 查找 data-module="abstract" 的 div
            desc = result.xpath('.//div[@data-module="abstract"]//text()')
            desc = ''.join(desc).strip() if title else ''
            # print("标题:", title)
            # print("链接:", link)
            # print("摘要:", desc)
            # print("-" * 80)
            if title and link:
                pages.append({"title":title,"link":link,"desc":desc})
    else:
        print(f"请求失败，状态码：{response.status_code}")

    return pages


"""
单独的网页
"""
def get_single_page(url,size=2048):
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)
        # 设置编码格式（自动检测）
        response.encoding = response.apparent_encoding
        # 确保响应正常
        if response.status_code == 200:
            #print(response.text)
            tree = html.fromstring(response.text)
            all_text=""
            # 获取<body>标签下的所有文本
            body = tree.find('.//body')
            if body is not None:
                for tag in body.xpath('.//script | .//style | .//noscript'):
                    parent = tag.getparent()
                    if parent is not None:
                        parent.remove(tag)

                # 可选：移除内联事件（如 onclick）
                for elem in body.xpath('.//*[@onclick or @onload or @onerror]'):
                    # 移除特定属性
                    if 'onclick' in elem.attrib:
                        del elem.attrib['onclick']
                    if 'onload' in elem.attrib:
                        del elem.attrib['onload']
                all_text = body.text_content()
                all_text = all_text.strip()
                #print(all_text)

            # 提取 title
            title = tree.xpath('//title/text()')
            title = title[0] if title else ""

            # 提取 keywords
            keywords = tree.xpath('//meta[@name="keywords"]/@content')
            keywords = keywords[0] if keywords else ""

            result={"code":200,"url":url,"title":title,"keywords":keywords,"content":all_text[0:size]}
    except Exception as e :
        result={"code":500,"url":url,"content":f'{e}'},None
    return result,response.text

#百度网页搜索
def cn_bing_web(keywords):
    # 百度搜索的URL模板
    url = "https://cn.bing.com/search"

    # 搜索关键词
    params = {
        "q": keywords
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
    }

    response = requests.get(url, params=params, headers=headers)


    # 设置编码格式（自动检测）
    response.encoding = response.apparent_encoding

    pages=[]
    # 确保响应正常
    if response.status_code == 200:
        #print(response.content.decode("utf-8"))
        # 使用 lxml 解析 HTML
        tree = html.fromstring(response.text)

        search_items = tree.xpath('//li[@class="b_algo"]')

        for item in search_items:
            title = item.xpath('.//h2/a/text()')
            link = item.xpath('.//h2/a/@href')

            title = ''.join(title).strip() if title else ''
            link = link[0] if link else ''

            if title and link:
                pages.append({"title":title,"link":link,"desc":title})
            
    else:
        print(f"请求失败，状态码：{response.status_code}")

    return pages


#联网搜索的主入口
def search_web(keywords):
    r0 = sogou_search(keywords)
    r1 = baidu_web(keywords)
    r2 = cn_bing_web(keywords)
    print(f"搜狗[{len(r0)}],百度[{len(r1)}],必应[{len(r2)}]===={keywords}")
    return r0+r1+r2

"""
页面的详细内容
"""
def get_detail_page(pages,size=2048):
    result=[]
    for page in pages:
        url = page["link"]

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
        }

        try:
            response = requests.get(url, headers=headers)
            # 设置编码格式（自动检测）
            response.encoding = response.apparent_encoding
            # 确保响应正常
            if response.status_code == 200:
                tree = html.fromstring(response.text)
                # 获取<body>标签下的所有文本
                body_element = tree.find('.//body')
                if body_element is not None:
                    all_text = body_element.text_content()
                    result.append({"url":page["link"],"title":page["title"],"desc":page["desc"],"content":all_text[0:size]})
        except:
            pass
    #end for
    return result


import asyncio
import aiohttp

#单个页面
async def fetch(session, url):
    try:
        async with session.get(url) as resp:
            if resp.status == 200:
                text = await resp.text()
                body = html.fromstring(text)
                # 增加对sogo重定向页面的处理

                noscript_meta = body.xpath('//noscript/meta[@http-equiv="refresh"]/@content')
                if noscript_meta:
                    content = noscript_meta[0]
                    # 匹配 URL='xxx' 或 URL=xxx
                    match = re.search(r'URL\s*=\s*["\']?([^"\'>\s]+)', content, re.IGNORECASE)
                    if match:
                        url = match.group(1)
                        return  await fetch(session,url)

                # 获取<body>标签下的所有文本
                for tag in body.xpath('.//script | .//style | .//noscript'):
                    parent = tag.getparent()
                    if parent is not None:
                        parent.remove(tag)

                # 可选：移除内联事件（如 onclick）
                for elem in body.xpath('.//*[@onclick or @onload or @onerror]'):
                    # 移除特定属性
                    if 'onclick' in elem.attrib:
                        del elem.attrib['onclick']
                    if 'onload' in elem.attrib:
                        del elem.attrib['onload']
                all_text = body.text_content()
                all_text = all_text.strip()
                return {"url": url, "content": all_text[0:6000]}
    except:
        pass
#任务池
async def task_pool(urls):
    # 设置全局 headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
    }

    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [fetch(session, url) for url in urls]
        results = await asyncio.gather(*tasks)  # 并发执行，收集结果
    
    return results  # 返回所有结果

#异步请求
def get_detail_page2(pages):
    result = asyncio.run(task_pool([page["link"] for page in pages]))
    return [r for r in result if r ]


#
def test_web(keywords):
    # 百度搜索的URL模板
    url = "https://www.sogou.com/web"

    # 搜索关键词
    params = {
        "query": keywords
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36"
    }

    response = requests.get(url, params=params, headers=headers)

    print(response.text)


def sogou_search(keyword, timeout=10):
    """
    模拟搜狗网页搜索，返回解析后的结果列表
    :param keyword: 搜索关键词
    :param page: 页码 (从1开始)
    :param timeout: 请求超时时间
    :return: 搜索结果列表 [{title, url, abstract}, ...]
    """
    # 搜狗搜索URL构造
    url = f"https://www.sogou.com/web"

    # 搜索关键词
    params = {
        "query": keyword
    }


    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Referer': 'https://www.sogou.com/',
    }

    try:
        response = requests.get(url, params=params,headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = 'utf-8'

        html = etree.HTML(response.text)

        results = []
        # 搜狗搜索结果容器（可能随前端改版而变化）
        result_nodes = html.xpath('//div[@class="vrwrap"] | //div[@class="rb"]')

        for node in result_nodes:
            try:
                # 标题
                title_node = node.xpath('.//h3/a')
                if not title_node:
                    continue
                title = ''.join(title_node[0].xpath('.//text()')).strip()

                # 链接
                href = title_node[0].get('href', '')
                # 搜狗的链接是跳转链接，需要处理
                if href.startswith('/link?url='):
                    full_url = 'https://www.sogou.com' + href
                    # 可选：请求一次获取真实URL（会增加延迟和请求量）
                    real_url = get_real_url(full_url, headers, timeout)
                else:
                    real_url = href

                # 摘要/描述
                abstract_parts = node.xpath('.//div[contains(@class, "fz-mid")]//text() | .//p[contains(@class, "str_info")]//text()')
                abstract = ''.join(abstract_parts).strip().replace('\n', '').replace('\r', '').replace(' ', '')

                if title and real_url:
                    results.append({
                        'title': title,
                        'link': real_url,
                        'desc': abstract
                    })
            except Exception as e:
                print(f"[解析单条结果出错]: {e}")
                continue

        return results

    except Exception as e:
        print(f"[请求失败]: {e}")
        return []

def get_real_url(sogou_redirect_url, headers, timeout=10):
    """
    （可选）访问搜狗跳转链接，获取真实目标URL
    注意：频繁请求可能触发反爬
    """
    try:
        r = requests.get(sogou_redirect_url, headers=headers, timeout=timeout, allow_redirects=True)
        return r.url
    except:
        return sogou_redirect_url  # 失败则返回原跳转链接


if __name__ == "__main__":
    # keyword = "4b模型"
    # print(f"正在搜索: {keyword}")

    # results = sogou_search(keyword)

    # for i, res in enumerate(results, 1):
    #     print(f"\n{i}. {res['title']}")
    #     print(f"   URL: {res['url']}")
    #     print(f"   摘要: {res['abstract'][:100]}...")

    # print(f"\n共找到 {len(results)} 条结果")
    import sys

    
    question = "云智信安"
    print("=========================Search=============================")
    #r = cn_bing_web(question)
    r = search_web(sys.argv[1])

    
    print(len(r),r)
    print("=========================Detail=============================")
    pages = get_detail_page2(r)


    for page in pages:
        print(page)

    print(len(pages))

    """
    # r2 = get_detail_page(r)
    # print(r2)
    print("===========================BAIDU===========================")
    r1 = baidu_web(question)
    print(len(r1),r1)
    # r2 = get_detail_page(r)
    # print(r2)
    print("===========================R2===========================")
    r4 = search_web(question)
    print(len(r4),r4)
    print("===========================get_detail_page2===========================")
    r5 = get_detail_page2(r4)
    print(len(r5),r5)    
    """