This commit is contained in:
2025-11-07 09:05:17 +08:00
parent ec0c83702f
commit 4a4086fc77

View File

@@ -0,0 +1,620 @@
"""
智能图片匹配算法
"""
import asyncio
import logging
import re
from typing import List, Dict, Any, Optional, Tuple
import math
from collections import Counter
from ..models import ImageInfo, ImageTag
logger = logging.getLogger(__name__)
class ImageMatcher:
"""智能图片匹配器"""
def __init__(self, config: Dict[str, Any]):
self.config = config
# 匹配权重配置
self.weights = {
'keyword_match': config.get('keyword_weight', 0.4),
'tag_match': config.get('tag_weight', 0.3),
'description_match': config.get('description_weight', 0.2),
'usage_popularity': config.get('usage_weight', 0.1)
}
# 停用词列表
self.stop_words = set([
'', '', '', '', '', '', '', '', '', '', '', '', '一个',
'', '', '', '', '', '', '', '', '', '', '没有', '', '',
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'
])
async def rank_images(self, query: str, images: List[ImageInfo]) -> List[ImageInfo]:
"""对图片进行智能排序"""
if not images:
return images
try:
# 提取查询关键词
query_keywords = self._extract_keywords(query)
# 计算每个图片的匹配分数
scored_images = []
for image in images:
score = await self._calculate_match_score(query_keywords, image)
scored_images.append((score, image))
# 按分数排序
scored_images.sort(key=lambda x: x[0], reverse=True)
# 返回排序后的图片列表
return [image for _, image in scored_images]
except Exception as e:
logger.error(f"Failed to rank images: {e}")
return images
async def _calculate_match_score(self, query_keywords: List[str], image: ImageInfo) -> float:
"""计算图片匹配分数"""
total_score = 0.0
try:
# 1. 关键词匹配分数
keyword_score = self._calculate_keyword_score(query_keywords, image)
total_score += keyword_score * self.weights['keyword_match']
# 2. 标签匹配分数
tag_score = self._calculate_tag_score(query_keywords, image)
total_score += tag_score * self.weights['tag_match']
# 3. 描述匹配分数
description_score = self._calculate_description_score(query_keywords, image)
total_score += description_score * self.weights['description_match']
# 4. 使用热度分数
popularity_score = self._calculate_popularity_score(image)
total_score += popularity_score * self.weights['usage_popularity']
return total_score
except Exception as e:
logger.error(f"Failed to calculate match score for image {image.image_id}: {e}")
return 0.0
def _extract_keywords(self, text: str) -> List[str]:
"""提取关键词"""
if not text:
return []
# 转换为小写
text = text.lower()
# 使用正则表达式提取单词
words = re.findall(r'\b\w+\b', text)
# 过滤停用词和短词
keywords = [
word for word in words
if word not in self.stop_words and len(word) > 1
]
return keywords
def _calculate_keyword_score(self, query_keywords: List[str], image: ImageInfo) -> float:
"""计算关键词匹配分数"""
if not query_keywords:
return 0.0
# 获取图片的所有关键词
image_keywords = [kw.lower() for kw in image.keywords]
# 计算匹配的关键词数量
matches = 0
for query_kw in query_keywords:
for image_kw in image_keywords:
if query_kw in image_kw or image_kw in query_kw:
matches += 1
break
# 计算匹配率
match_ratio = matches / len(query_keywords)
return match_ratio
def _calculate_tag_score(self, query_keywords: List[str], image: ImageInfo) -> float:
"""计算标签匹配分数"""
if not query_keywords or not image.tags:
return 0.0
# 计算匹配分数,考虑标签置信度
total_score = 0.0
total_weight = 0.0
for tag in image.tags:
tag_name = tag.name.lower()
tag_confidence = tag.confidence
# 检查标签是否与查询关键词匹配
match_score = 0.0
for query_kw in query_keywords:
if query_kw in tag_name or tag_name in query_kw:
match_score = 1.0
break
elif self._calculate_similarity(query_kw, tag_name) > 0.7:
match_score = 0.8
total_score += match_score * tag_confidence
total_weight += tag_confidence
return total_score / total_weight if total_weight > 0 else 0.0
def _calculate_description_score(self, query_keywords: List[str], image: ImageInfo) -> float:
"""计算描述匹配分数"""
if not query_keywords:
return 0.0
# 合并标题、描述和alt文本
text_content = []
if image.title:
text_content.append(image.title)
if image.description:
text_content.append(image.description)
if image.alt_text:
text_content.append(image.alt_text)
if not text_content:
return 0.0
# 提取描述中的关键词
description_text = ' '.join(text_content).lower()
description_keywords = self._extract_keywords(description_text)
# 计算TF-IDF相似度
return self._calculate_tfidf_similarity(query_keywords, description_keywords)
def _calculate_popularity_score(self, image: ImageInfo) -> float:
"""计算使用热度分数"""
# 基于使用次数和最近使用时间计算热度
usage_count = image.usage_count
# 使用对数缩放避免热门图片过度占优
usage_score = math.log(usage_count + 1) / math.log(100) # 归一化到0-1
return min(usage_score, 1.0)
def _calculate_similarity(self, word1: str, word2: str) -> float:
"""计算两个词的相似度(简单的编辑距离)"""
if not word1 or not word2:
return 0.0
# 计算编辑距离
m, n = len(word1), len(word2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
for i in range(1, m + 1):
for j in range(1, n + 1):
if word1[i-1] == word2[j-1]:
dp[i][j] = dp[i-1][j-1]
else:
dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1
# 转换为相似度
max_len = max(m, n)
similarity = 1.0 - (dp[m][n] / max_len) if max_len > 0 else 0.0
return similarity
def _calculate_tfidf_similarity(self, query_keywords: List[str], doc_keywords: List[str]) -> float:
"""计算TF-IDF相似度"""
if not query_keywords or not doc_keywords:
return 0.0
# 简化的TF-IDF计算
query_counter = Counter(query_keywords)
doc_counter = Counter(doc_keywords)
# 计算交集
common_keywords = set(query_keywords) & set(doc_keywords)
if not common_keywords:
return 0.0
# 计算相似度
similarity = 0.0
for keyword in common_keywords:
query_tf = query_counter[keyword] / len(query_keywords)
doc_tf = doc_counter[keyword] / len(doc_keywords)
similarity += query_tf * doc_tf
return similarity
async def suggest_images_for_content(self,
content: str,
available_images: List[ImageInfo],
max_suggestions: int = 5) -> List[ImageInfo]:
"""为内容推荐图片"""
try:
# 分析内容,提取关键信息
content_keywords = self._extract_keywords(content)
# 识别内容类型和主题
content_type = self._identify_content_type(content)
content_theme = self._identify_content_theme(content)
# 过滤相关图片
relevant_images = []
for image in available_images:
relevance_score = await self._calculate_content_relevance(
content_keywords, content_type, content_theme, image
)
if relevance_score > 0.1: # 设置最低相关度阈值
relevant_images.append((relevance_score, image))
# 排序并返回前N个
relevant_images.sort(key=lambda x: x[0], reverse=True)
return [image for _, image in relevant_images[:max_suggestions]]
except Exception as e:
logger.error(f"Failed to suggest images for content: {e}")
return []
def _identify_content_type(self, content: str) -> str:
"""识别内容类型"""
content_lower = content.lower()
# 简单的关键词匹配
if any(word in content_lower for word in ['数据', '统计', '图表', '分析', 'data', 'chart', 'graph']):
return 'data'
elif any(word in content_lower for word in ['技术', '科技', '创新', 'technology', 'innovation']):
return 'technology'
elif any(word in content_lower for word in ['商业', '业务', '市场', 'business', 'market']):
return 'business'
elif any(word in content_lower for word in ['教育', '学习', '培训', 'education', 'learning']):
return 'education'
else:
return 'general'
def _identify_content_theme(self, content: str) -> str:
"""识别内容主题"""
content_lower = content.lower()
# 主题关键词映射
themes = {
'success': ['成功', '成就', '胜利', 'success', 'achievement', 'victory'],
'growth': ['增长', '发展', '提升', 'growth', 'development', 'improvement'],
'teamwork': ['团队', '合作', '协作', 'team', 'cooperation', 'collaboration'],
'innovation': ['创新', '创意', '新颖', 'innovation', 'creative', 'novel'],
'challenge': ['挑战', '困难', '问题', 'challenge', 'difficulty', 'problem'],
'future': ['未来', '前景', '展望', 'future', 'prospect', 'outlook']
}
for theme, keywords in themes.items():
if any(keyword in content_lower for keyword in keywords):
return theme
return 'neutral'
async def _calculate_content_relevance(self,
content_keywords: List[str],
content_type: str,
content_theme: str,
image: ImageInfo) -> float:
"""计算图片与内容的相关度"""
relevance_score = 0.0
# 基础关键词匹配
keyword_score = self._calculate_keyword_score(content_keywords, image)
relevance_score += keyword_score * 0.4
# 标签匹配
tag_score = self._calculate_tag_score(content_keywords, image)
relevance_score += tag_score * 0.3
# 内容类型匹配
type_score = self._calculate_type_match(content_type, image)
relevance_score += type_score * 0.2
# 主题匹配
theme_score = self._calculate_theme_match(content_theme, image)
relevance_score += theme_score * 0.1
return relevance_score
def _calculate_type_match(self, content_type: str, image: ImageInfo) -> float:
"""计算内容类型匹配度"""
# 根据图片标签判断类型匹配
type_keywords = {
'data': ['chart', 'graph', 'data', 'statistics', '图表', '数据'],
'technology': ['tech', 'computer', 'digital', '科技', '技术'],
'business': ['business', 'office', 'meeting', '商业', '办公'],
'education': ['education', 'learning', 'book', '教育', '学习']
}
if content_type not in type_keywords:
return 0.5 # 中性分数
type_words = type_keywords[content_type]
image_tags = [tag.name.lower() for tag in image.tags]
image_keywords = [kw.lower() for kw in image.keywords]
matches = 0
total_checks = len(type_words)
for word in type_words:
if any(word in tag for tag in image_tags) or any(word in kw for kw in image_keywords):
matches += 1
return matches / total_checks if total_checks > 0 else 0.0
def _calculate_theme_match(self, content_theme: str, image: ImageInfo) -> float:
"""计算主题匹配度"""
# 根据图片的情感色彩和主题标签判断
theme_keywords = {
'success': ['success', 'winner', 'achievement', '成功', '胜利'],
'growth': ['growth', 'arrow', 'up', '增长', '上升'],
'teamwork': ['team', 'group', 'together', '团队', '合作'],
'innovation': ['innovation', 'creative', 'new', '创新', '创意'],
'challenge': ['challenge', 'difficult', 'problem', '挑战', '困难'],
'future': ['future', 'tomorrow', 'next', '未来', '明天']
}
if content_theme not in theme_keywords:
return 0.5 # 中性分数
theme_words = theme_keywords[content_theme]
image_tags = [tag.name.lower() for tag in image.tags]
image_description = (image.description or '').lower()
matches = 0
for word in theme_words:
if any(word in tag for tag in image_tags) or word in image_description:
matches += 1
return min(matches / len(theme_words), 1.0) if theme_words else 0.0
def _calculate_description_score(self, query_keywords: List[str], image: ImageInfo) -> float:
"""计算描述匹配分数"""
if not query_keywords:
return 0.0
# 合并标题、描述和alt文本
text_content = []
if image.title:
text_content.append(image.title)
if image.description:
text_content.append(image.description)
if image.alt_text:
text_content.append(image.alt_text)
if not text_content:
return 0.0
# 提取描述中的关键词
description_text = ' '.join(text_content).lower()
description_keywords = self._extract_keywords(description_text)
# 计算TF-IDF相似度
return self._calculate_tfidf_similarity(query_keywords, description_keywords)
def _calculate_popularity_score(self, image: ImageInfo) -> float:
"""计算使用热度分数"""
# 基于使用次数和最近使用时间计算热度
usage_count = image.usage_count
# 使用对数缩放避免热门图片过度占优
usage_score = math.log(usage_count + 1) / math.log(100) # 归一化到0-1
# 考虑最近使用时间(可选)
# 这里简化处理,只考虑使用次数
return min(usage_score, 1.0)
def _calculate_similarity(self, word1: str, word2: str) -> float:
"""计算两个词的相似度(简单的编辑距离)"""
if not word1 or not word2:
return 0.0
# 计算编辑距离
m, n = len(word1), len(word2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
for i in range(1, m + 1):
for j in range(1, n + 1):
if word1[i-1] == word2[j-1]:
dp[i][j] = dp[i-1][j-1]
else:
dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1
# 转换为相似度
max_len = max(m, n)
similarity = 1.0 - (dp[m][n] / max_len) if max_len > 0 else 0.0
return similarity
def _calculate_tfidf_similarity(self, query_keywords: List[str], doc_keywords: List[str]) -> float:
"""计算TF-IDF相似度"""
if not query_keywords or not doc_keywords:
return 0.0
# 简化的TF-IDF计算
query_counter = Counter(query_keywords)
doc_counter = Counter(doc_keywords)
# 计算交集
common_keywords = set(query_keywords) & set(doc_keywords)
if not common_keywords:
return 0.0
# 计算相似度
similarity = 0.0
for keyword in common_keywords:
query_tf = query_counter[keyword] / len(query_keywords)
doc_tf = doc_counter[keyword] / len(doc_keywords)
similarity += query_tf * doc_tf
return similarity
async def suggest_images_for_content(self,
content: str,
available_images: List[ImageInfo],
max_suggestions: int = 5) -> List[ImageInfo]:
"""为内容推荐图片"""
try:
# 分析内容,提取关键信息
content_keywords = self._extract_keywords(content)
# 识别内容类型和主题
content_type = self._identify_content_type(content)
content_theme = self._identify_content_theme(content)
# 过滤相关图片
relevant_images = []
for image in available_images:
relevance_score = await self._calculate_content_relevance(
content_keywords, content_type, content_theme, image
)
if relevance_score > 0.1: # 设置最低相关度阈值
relevant_images.append((relevance_score, image))
# 排序并返回前N个
relevant_images.sort(key=lambda x: x[0], reverse=True)
return [image for _, image in relevant_images[:max_suggestions]]
except Exception as e:
logger.error(f"Failed to suggest images for content: {e}")
return []
def _identify_content_type(self, content: str) -> str:
"""识别内容类型"""
content_lower = content.lower()
# 简单的关键词匹配
if any(word in content_lower for word in ['数据', '统计', '图表', '分析', 'data', 'chart', 'graph']):
return 'data'
elif any(word in content_lower for word in ['技术', '科技', '创新', 'technology', 'innovation']):
return 'technology'
elif any(word in content_lower for word in ['商业', '业务', '市场', 'business', 'market']):
return 'business'
elif any(word in content_lower for word in ['教育', '学习', '培训', 'education', 'learning']):
return 'education'
else:
return 'general'
def _identify_content_theme(self, content: str) -> str:
"""识别内容主题"""
content_lower = content.lower()
# 主题关键词映射
themes = {
'success': ['成功', '成就', '胜利', 'success', 'achievement', 'victory'],
'growth': ['增长', '发展', '提升', 'growth', 'development', 'improvement'],
'teamwork': ['团队', '合作', '协作', 'team', 'cooperation', 'collaboration'],
'innovation': ['创新', '创意', '新颖', 'innovation', 'creative', 'novel'],
'challenge': ['挑战', '困难', '问题', 'challenge', 'difficulty', 'problem'],
'future': ['未来', '前景', '展望', 'future', 'prospect', 'outlook']
}
for theme, keywords in themes.items():
if any(keyword in content_lower for keyword in keywords):
return theme
return 'neutral'
async def _calculate_content_relevance(self,
content_keywords: List[str],
content_type: str,
content_theme: str,
image: ImageInfo) -> float:
"""计算图片与内容的相关度"""
relevance_score = 0.0
# 基础关键词匹配
keyword_score = self._calculate_keyword_score(content_keywords, image)
relevance_score += keyword_score * 0.4
# 标签匹配
tag_score = self._calculate_tag_score(content_keywords, image)
relevance_score += tag_score * 0.3
# 内容类型匹配
type_score = self._calculate_type_match(content_type, image)
relevance_score += type_score * 0.2
# 主题匹配
theme_score = self._calculate_theme_match(content_theme, image)
relevance_score += theme_score * 0.1
return relevance_score
def _calculate_type_match(self, content_type: str, image: ImageInfo) -> float:
"""计算内容类型匹配度"""
# 根据图片标签判断类型匹配
type_keywords = {
'data': ['chart', 'graph', 'data', 'statistics', '图表', '数据'],
'technology': ['tech', 'computer', 'digital', '科技', '技术'],
'business': ['business', 'office', 'meeting', '商业', '办公'],
'education': ['education', 'learning', 'book', '教育', '学习']
}
if content_type not in type_keywords:
return 0.5 # 中性分数
type_words = type_keywords[content_type]
image_tags = [tag.name.lower() for tag in image.tags]
image_keywords = [kw.lower() for kw in image.keywords]
matches = 0
total_checks = len(type_words)
for word in type_words:
if any(word in tag for tag in image_tags) or any(word in kw for kw in image_keywords):
matches += 1
return matches / total_checks if total_checks > 0 else 0.0
def _calculate_theme_match(self, content_theme: str, image: ImageInfo) -> float:
"""计算主题匹配度"""
# 根据图片的情感色彩和主题标签判断
theme_keywords = {
'success': ['success', 'winner', 'achievement', '成功', '胜利'],
'growth': ['growth', 'arrow', 'up', '增长', '上升'],
'teamwork': ['team', 'group', 'together', '团队', '合作'],
'innovation': ['innovation', 'creative', 'new', '创新', '创意'],
'challenge': ['challenge', 'difficult', 'problem', '挑战', '困难'],
'future': ['future', 'tomorrow', 'next', '未来', '明天']
}
if content_theme not in theme_keywords:
return 0.5 # 中性分数
theme_words = theme_keywords[content_theme]
image_tags = [tag.name.lower() for tag in image.tags]
image_description = (image.description or '').lower()
matches = 0
for word in theme_words:
if any(word in tag for tag in image_tags) or word in image_description:
matches += 1
return min(matches / len(theme_words), 1.0) if theme_words else 0.0