# ailine/backend/app/core/web_search.py

"""
Web search public utility.

Tries Tavily first (requires an API key), then the free, key-less DuckDuckGo
client (``ddgs``), then a plain HTTP request, and finally placeholder results.
"""

from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime
import requests
import warnings
import re


@dataclass
class SearchResult:
    """A single search hit produced by one of the search backends.

    Attributes:
        title: Title of the hit.
        url: Link to the hit.
        snippet: Short text excerpt for the hit.
        source: Name of the backend that produced the hit; defaults to
            ``"DuckDuckGo"``.
        timestamp: Creation time of this object. When left as ``None`` it is
            replaced with ``datetime.now()`` right after construction.
    """
    title: str
    url: str
    snippet: str
    source: str = "DuckDuckGo"
    # None is only a placeholder; __post_init__ swaps it for the current time.
    timestamp: Optional[datetime] = None

    def __post_init__(self) -> None:
        """Fill in ``timestamp`` with the current local time when not given."""
        if self.timestamp is None:
            self.timestamp = datetime.now()


class WebSearchTool:
    """Shared web-search helper that tries several backends in priority order.

    ``search`` tries Tavily, then the ``ddgs`` DuckDuckGo client, then a plain
    HTTP request to Baidu, and finally returns canned placeholder results, so
    a non-blank query always yields a list.
    """

    def __init__(self, max_results: int = 5):
        """Create the tool.

        Args:
            max_results: Default number of results, used whenever a call does
                not supply its own positive count.
        """
        self.max_results = max_results

    def _resolve_count(self, max_results: Optional[int]) -> int:
        """Return ``max_results`` when it is positive, else the instance default."""
        # None, 0 and negative values all fall back to the default; a negative
        # count would otherwise reach list slicing and select the wrong items.
        if max_results is not None and max_results > 0:
            return max_results
        return self.max_results

    def search(self, query: str, max_results: Optional[int] = None) -> List[SearchResult]:
        """Search with several backends, trying them in priority order.

        Args:
            query: Search keywords. A blank (empty or whitespace-only) query
                returns an empty list without contacting any backend.
            max_results: Number of results to return; non-positive values and
                ``None`` fall back to the count given at construction time.

        Returns:
            List of search results from the first backend that succeeds.
        """
        if not query or not query.strip():
            return []

        num_results = self._resolve_count(max_results)

        # Option 1: Tavily (needs an API key).
        try:
            return self._search_tavily(query, num_results)
        except ImportError:
            print("[WebSearch] tavily 未安装，尝试其他搜索方式")
        except Exception as e:
            if "API_KEY" in str(e) or "未配置" in str(e):
                print(f"[WebSearch] Tavily API Key 未配置: {e}")
            else:
                print(f"[WebSearch] Tavily 搜索失败: {e}")

        # Option 2: the ddgs package. An empty result list falls through to
        # the next backend instead of being returned.
        try:
            ddgs_results = self._search_ddgs(query, num_results)
            if ddgs_results:
                return ddgs_results
        except ImportError:
            # The next backend actually tried is the plain HTTP search.
            print("[WebSearch] ddgs 未安装，尝试 HTTP 搜索")
        except Exception as e:
            print(f"[WebSearch] ddgs 搜索失败: {e}")

        # Option 3: plain HTTP request.
        try:
            return self._search_http(query, num_results)
        except Exception as e:
            print(f"[WebSearch] HTTP 搜索也失败: {e}")

        # Option 4: placeholder data as the last resort.
        return self._search_mock(query, num_results)

    def _search_tavily(self, query: str, max_results: int) -> List[SearchResult]:
        """Search through the Tavily API.

        Args:
            query: Search keywords.
            max_results: Requested number of results; capped by
                ``TAVILY_MAX_RESULTS`` (or 5 when that setting is falsy).

        Returns:
            Results converted from the ``"results"`` entries of the response.

        Raises:
            ImportError: If ``tavily`` or ``app.config`` cannot be imported.
            ValueError: If ``TAVILY_API_KEY`` is not configured.
        """
        from tavily import TavilyClient
        from app.config import TAVILY_API_KEY, TAVILY_MAX_RESULTS

        if not TAVILY_API_KEY:
            raise ValueError("TAVILY_API_KEY 未配置")

        client = TavilyClient(api_key=TAVILY_API_KEY)
        response = client.search(
            query=query,
            max_results=min(max_results, TAVILY_MAX_RESULTS or 5),
            include_answer=True,
            include_raw_content=False
        )

        results = [
            SearchResult(
                title=item.get("title", ""),
                url=item.get("url", ""),
                snippet=item.get("content", ""),
                source="Tavily"
            )
            for item in response.get("results", [])
        ]

        print(f"[WebSearch] Tavily 返回 {len(results)} 条结果")
        return results

    def _search_ddgs(self, query: str, max_results: int) -> List[SearchResult]:
        """Search DuckDuckGo through the optional ``ddgs`` package.

        Args:
            query: Search keywords.
            max_results: Number of results requested from ``ddgs``.

        Returns:
            Converted results; may be empty.

        Raises:
            ImportError: If ``ddgs`` is not installed.
        """
        from ddgs import DDGS

        print(f"[WebSearch] 使用 ddgs 搜索: {query}")
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))

        results = [
            SearchResult(
                title=hit.get("title", ""),
                url=hit.get("href", ""),
                snippet=hit.get("body", ""),
                source="DuckDuckGo"
            )
            for hit in hits
        ]
        if results:
            print(f"[WebSearch] ddgs 返回 {len(results)} 条结果")
        return results

    def _search_http(self, query: str, max_results: int) -> List[SearchResult]:
        """Fallback search over plain HTTP; never raises.

        Tries Baidu first and returns placeholder results if that fails.

        Args:
            query: Search keywords.
            max_results: Number of results wanted.

        Returns:
            Baidu results, or placeholder results when Baidu is unreachable.
        """
        print("[WebSearch] 尝试 HTTP 搜索")

        # Option 1: Baidu.
        try:
            return self._search_baidu(query, max_results)
        except Exception as e:
            print(f"[WebSearch] 百度搜索失败: {e}")

        # Option 2: placeholder data.
        return self._search_mock(query, max_results)

    def _search_baidu(self, query: str, max_results: int) -> List[SearchResult]:
        """Check that the Baidu results page is reachable for ``query``.

        The response HTML is not parsed: on success a single placeholder
        result pointing at the Baidu results page is returned, and
        ``max_results`` is currently unused.

        Args:
            query: Search keywords; URL-quoted into the ``wd`` parameter.
            max_results: Accepted for signature parity with other backends.

        Returns:
            A one-element list holding the placeholder result.

        Raises:
            Exception: Whatever ``requests`` raises for connection errors,
                timeouts or non-2xx statuses. The caller logs the failure, so
                it is not logged here as well.
        """
        import requests
        from urllib.parse import quote

        url = f"https://www.baidu.com/s?wd={quote(query)}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Illustrative only: real Baidu result parsing would be more involved.
        return [
            SearchResult(
                title=f"百度搜索: {query}",
                url=url,
                snippet="如需要真实搜索结果，请考虑使用百度搜索 API",
                source="百度"
            )
        ]

    def _search_mock(self, query: str, max_results: Optional[int] = None) -> List[SearchResult]:
        """Build placeholder results (last-resort fallback).

        Args:
            query: Search keywords, echoed into the placeholder titles.
            max_results: Maximum number of placeholders to return;
                non-positive values and ``None`` fall back to the instance
                default. At most three placeholders exist.

        Returns:
            Up to three placeholder results whose source is ``"模拟数据"``.
        """
        print(f"[WebSearch] 使用模拟搜索结果 (查询: {query})")

        # Placeholder entries that echo the query and hint at network setup.
        mock_templates = [
            {
                "title": f"关于「{query}」的相关介绍",
                "snippet": "这是模拟结果。如需真实搜索，请检查容器网络连接或配置代理。",
                "url": "https://example.com/about"
            },
            {
                "title": f"「{query}」 - 最新动态",
                "snippet": "提示：在容器内运行时，需要确保能访问外网。",
                "url": "https://example.com/latest"
            },
            {
                "title": f"了解更多关于「{query}」的内容",
                "snippet": "建议：检查 Docker 网络配置，或使用代理。",
                "url": "https://example.com/more"
            }
        ]

        num = self._resolve_count(max_results)
        return [
            SearchResult(
                title=template["title"],
                url=template["url"],
                snippet=template["snippet"],
                source="模拟数据"
            )
            for template in mock_templates[:num]
        ]

    def format_search_results(self, results: List[SearchResult]) -> str:
        """Format search results as Markdown with numbered source citations.

        Args:
            results: Search results to render.

        Returns:
            Markdown text, or a short "no results" message when ``results``
            is empty.
        """
        if not results:
            return "未找到相关搜索结果"

        lines = ["## 🔍 联网搜索结果\n"]

        for idx, result in enumerate(results, 1):
            lines.append(f"### [{idx}] {result.title}")
            lines.append(f"- 🔗 来源：[{result.url}]({result.url})")
            lines.append(f"- 📝 摘要：{result.snippet}")
            lines.append(f"- 📅 时间：{result.timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
            lines.append("")

        # Footer explaining how the numbered citations map to sources.
        lines.extend([
            "---",
            "💡 **引用溯源说明**：",
            "- 以上搜索结果均标注了来源链接",
            "- 使用方括号数字标识引用（如 [1]、[2]）",
            "- 可通过链接追溯原始信息",
        ])

        return "\n".join(lines)


# Module-level singleton instance; created on first request.
_web_search_tool = None


def get_web_search_tool() -> WebSearchTool:
    """Return the shared WebSearchTool, creating it on the first call."""
    global _web_search_tool
    if _web_search_tool is not None:
        return _web_search_tool
    _web_search_tool = WebSearchTool()
    return _web_search_tool


def web_search(query: str, max_results: int = 5) -> str:
    """
    Convenience wrapper: run a web search and return the formatted text.

    Args:
        query: Search keywords.
        max_results: Number of results to return.

    Returns:
        The search results rendered by the shared tool's formatter.
    """
    searcher = get_web_search_tool()
    return searcher.format_search_results(searcher.search(query, max_results))