2025-11-07 09:05:25 +08:00
parent 9c881ad463
commit d36f3b0b74

@@ -0,0 +1,281 @@
"""
Web Content Extraction Pipeline for Research Functionality
This module provides robust web content extraction using BeautifulSoup to fetch
and parse HTML content from web pages, with proper error handling and content cleaning.
"""
import asyncio
import logging
import re
import time
from typing import Dict, List, Optional, Any, Set
from urllib.parse import urljoin, urlparse
import aiohttp
from bs4 import BeautifulSoup, Comment
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ...core.config import ai_config
logger = logging.getLogger(__name__)


class ExtractedContent:
    """Represents extracted content from a web page"""

    def __init__(self, url: str, title: str = "", content: str = "",
                 metadata: Optional[Dict[str, Any]] = None):
        self.url = url
        self.title = title
        self.content = content
        self.metadata = metadata or {}
        self.extraction_time = time.time()
        self.word_count = len(content.split()) if content else 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        return {
            'url': self.url,
            'title': self.title,
            'content': self.content,
            'metadata': self.metadata,
            'extraction_time': self.extraction_time,
            'word_count': self.word_count
        }
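
# A minimal illustration of the container above (hypothetical values, not
# part of the pipeline):
#
#     item = ExtractedContent("https://example.com", title="Example",
#                             content="Hello world")
#     item.to_dict()["word_count"]  # -> 2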


class WebContentExtractor:
    """Web content extraction pipeline using BeautifulSoup"""

    def __init__(self):
        self.timeout = ai_config.research_extraction_timeout
        self.max_content_length = ai_config.research_max_content_length
        self.user_agent = "LandPPT Research Bot 1.0"

        # Content selectors, tried in order of preference
        self.content_selectors = [
            'article',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-content',
            '.story-body',
            '.post-body',
            '#content',
            '#main-content'
        ]

        # Tags to remove completely (the non-standard names such as
        # 'advertisement' and 'ads' match custom elements some pages use;
        # class-based ad containers are handled by the selectors and fallback)
        self.remove_tags = {
            'script', 'style', 'nav', 'header', 'footer', 'aside',
            'advertisement', 'ads', 'sidebar', 'menu', 'popup'
        }

        # Text splitter for long content
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.max_content_length,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
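
    # Note: the splitter is only invoked when a page exceeds
    # max_content_length; extract_content keeps just the first chunk, so the
    # separators above determine where an over-long page is cut.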

    def _clean_text(self, text: str) -> str:
        """Clean and normalize extracted text"""
        if not text:
            return ""

        # Remove common unwanted patterns first, while any line breaks are
        # still intact (the (?=\n|$) look-aheads anchor on them)
        text = re.sub(r'(Cookie|Privacy) Policy.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Subscribe.*?newsletter.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Follow us on.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Share this.*?(?=\n|$)', '', text, flags=re.IGNORECASE)

        # Remove URLs
        text = re.sub(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F]{2}))+', '', text)

        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)

        return text.strip()
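
    # Illustrative behaviour of _clean_text (hypothetical input):
    #   "Read   this  story\nShare this article"  ->  "Read this story"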

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML"""
        metadata = {}

        # Basic metadata
        if soup.title:
            metadata['title'] = soup.title.string.strip() if soup.title.string else ""

        # Meta tags
        meta_tags = soup.find_all('meta')
        for tag in meta_tags:
            name = tag.get('name') or tag.get('property')
            content = tag.get('content')
            if name and content:
                metadata[name] = content

        # Language
        html_tag = soup.find('html')
        if html_tag and html_tag.get('lang'):
            metadata['language'] = html_tag.get('lang')

        # Domain
        parsed_url = urlparse(url)
        metadata['domain'] = parsed_url.netloc

        return metadata
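
    # Typical result shape for a hypothetical page:
    #   {'title': 'Some Page', 'description': 'Summary text',
    #    'og:title': 'Some Page', 'language': 'en', 'domain': 'example.com'}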

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from HTML using various strategies"""
        # Remove unwanted tags
        for tag_name in self.remove_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Try content selectors in order of preference
        for selector in self.content_selectors:
            content_element = soup.select_one(selector)
            if content_element:
                text = content_element.get_text(separator=' ', strip=True)
                if len(text) > 100:  # Minimum content length
                    return self._clean_text(text)

        # Fallback: extract from body
        body = soup.find('body')
        if body:
            # Remove navigation, sidebar, and footer elements
            for element in body.find_all(['nav', 'aside', 'footer', 'header']):
                element.decompose()
            text = body.get_text(separator=' ', strip=True)
            return self._clean_text(text)

        # Last resort: get all text
        return self._clean_text(soup.get_text(separator=' ', strip=True))
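
    # Example (hypothetical markup): given
    #   "<nav>menu</nav><article>story text ...</article>"
    # the <nav> is decomposed, the 'article' selector matches first, and only
    # the story text reaches _clean_text, provided it exceeds 100 characters.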

    async def extract_content(self, url: str) -> Optional[ExtractedContent]:
        """
        Extract content from a single URL

        Args:
            url: URL to extract content from

        Returns:
            ExtractedContent object or None if extraction fails
        """
        try:
            headers = {
                'User-Agent': self.user_agent,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=self.timeout),
                headers=headers
            ) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to fetch {url}: HTTP {response.status}")
                        return None

                    # Check content type
                    content_type = response.headers.get('content-type', '').lower()
                    if 'text/html' not in content_type:
                        logger.warning(f"Skipping non-HTML content: {url}")
                        return None

                    html_content = await response.text()

                    # Parse HTML
                    soup = BeautifulSoup(html_content, 'html.parser')

                    # Extract metadata
                    metadata = self._extract_metadata(soup, url)

                    # Extract main content
                    content = self._extract_main_content(soup)

                    # Limit content length
                    if len(content) > self.max_content_length:
                        chunks = self.text_splitter.split_text(content)
                        content = chunks[0] if chunks else content[:self.max_content_length]

                    # Get title
                    title = metadata.get('title', '')
                    if not title and soup.title:
                        title = soup.title.string.strip() if soup.title.string else ""

                    extracted = ExtractedContent(
                        url=url,
                        title=title,
                        content=content,
                        metadata=metadata
                    )

                    logger.info(f"Extracted {extracted.word_count} words from {url}")
                    return extracted

        except asyncio.TimeoutError:
            logger.warning(f"Timeout extracting content from {url}")
            return None
        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {e}")
            return None

    async def extract_multiple(self, urls: List[str],
                               max_concurrent: int = 5,
                               delay_between_requests: float = 0.5) -> List[ExtractedContent]:
        """
        Extract content from multiple URLs with concurrency control

        Args:
            urls: List of URLs to extract content from
            max_concurrent: Maximum concurrent requests
            delay_between_requests: Delay between requests in seconds

        Returns:
            List of ExtractedContent objects
        """
        semaphore = asyncio.Semaphore(max_concurrent)
        results = []

        async def extract_with_semaphore(url: str) -> Optional[ExtractedContent]:
            async with semaphore:
                result = await self.extract_content(url)
                if delay_between_requests > 0:
                    await asyncio.sleep(delay_between_requests)
                return result

        # Create tasks for all URLs
        tasks = [extract_with_semaphore(url) for url in urls]

        # Execute tasks and collect results
        completed_results = await asyncio.gather(*tasks, return_exceptions=True)

        for result in completed_results:
            if isinstance(result, ExtractedContent):
                results.append(result)
            elif isinstance(result, Exception):
                logger.warning(f"Content extraction failed: {result}")

        logger.info(f"Successfully extracted content from {len(results)}/{len(urls)} URLs")
        return results

    def get_status(self) -> Dict[str, Any]:
        """Get extractor status information"""
        return {
            'timeout': self.timeout,
            'max_content_length': self.max_content_length,
            'user_agent': self.user_agent,
            'content_selectors': self.content_selectors,
            'remove_tags': list(self.remove_tags)
        }
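

if __name__ == "__main__":  # pragma: no cover
    # Minimal manual smoke test: an illustrative sketch, not part of the
    # pipeline. It needs network access and a configured ai_config, and the
    # relative import above means it must be run as a module from the
    # package root (python -m <package>.web_content_extractor).
    async def _demo() -> None:
        extractor = WebContentExtractor()
        results = await extractor.extract_multiple(
            ["https://example.com"], max_concurrent=1
        )
        for item in results:
            print(f"{item.url}: {item.word_count} words, title={item.title!r}")

    asyncio.run(_demo())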