qq_bot/handler/rss/parse.go

236 lines
5.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rss
import (
"crypto/md5"
"encoding/xml"
"fmt"
"io"
"net/http"
"sort"
"strings"
"time"
)
// CheckRssFeed issues a HEAD request to feedURL and reports whether the
// endpoint looks like an RSS or Atom feed.
//
// It returns nil when the server answers with status 200 and the
// Content-Type header mentions one of the accepted XML media types.
// Otherwise it returns the transport error, or an error carrying the
// unexpected status code or the rejected Content-Type value.
func CheckRssFeed(feedURL string) error {
	resp, err := http.Head(feedURL)
	if err != nil {
		return err
	}
	// The body must be closed even for HEAD responses, otherwise the
	// underlying connection is never released.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	contentType := resp.Header.Get("Content-Type")
	// Media type names are case-insensitive, so compare in lower case.
	// Substring matching tolerates parameters such as "; charset=utf-8".
	normalized := strings.ToLower(contentType)

	// Media types commonly used to serve RSS and Atom documents.
	validContentTypes := []string{
		"application/rss+xml",
		"application/atom+xml",
		"application/xml",
		"text/xml",
	}
	for _, validType := range validContentTypes {
		if strings.Contains(normalized, validType) {
			return nil
		}
	}
	return fmt.Errorf("RSS源无效: %s", contentType)
}
// ParseRssFeed downloads the document at feedURL and converts it into a
// slice of RssItem.
//
// A HEAD request is made first so that a missing feed, an empty feed or one
// advertising more than 10 MiB is rejected before the download starts. The
// body is then fetched with GET, read with the same 10 MiB cap, and parsed
// as RSS; if that fails it is parsed as Atom instead.
//
// It returns the transport error, or an error describing the unexpected
// status code, the size violation, the read failure, or the Atom parse
// failure when neither format could be decoded.
func ParseRssFeed(feedURL string) ([]RssItem, error) {
	// Upper bound on the accepted feed size: 10 MiB.
	const maxFeedSize = 10 << 20

	head, err := http.Head(feedURL)
	if err != nil {
		return nil, err
	}
	// Only the headers are needed; release the connection right away.
	head.Body.Close()

	if head.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("RSS源无效: %d", head.StatusCode)
	}
	if head.ContentLength == 0 || head.ContentLength > maxFeedSize {
		return nil, fmt.Errorf("RSS源的大小为%d,超出限制", head.ContentLength)
	}

	resp, err := http.Get(feedURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// The GET may answer differently from the HEAD probe.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	// The advertised length can be unknown (-1) or simply wrong, so the
	// limit is enforced on the bytes actually read. One extra byte is
	// requested to tell "exactly at the limit" apart from "over the limit".
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxFeedSize+1))
	if err != nil {
		return nil, fmt.Errorf("读取RSS数据失败: %w", err)
	}
	if len(body) > maxFeedSize {
		return nil, fmt.Errorf("RSS源的大小超出限制: 超过%d字节", maxFeedSize)
	}

	// Try RSS first and fall back to Atom when RSS decoding fails.
	if items, rssErr := parseRSSFormat(body); rssErr == nil {
		return items, nil
	}
	items, err := parseAtomFormat(body)
	if err != nil {
		return nil, fmt.Errorf("解析RSS/Atom数据失败: %w", err)
	}
	return items, nil
}
// parseRSSFormat decodes data as an RSS document and converts every channel
// item into an RssItem.
//
// Each item receives an MD5 hash of its title, link and description
// concatenated in that order. The result is sorted by publication date,
// newest first; items without a pubDate keep the zero time. Any XML
// decoding error is returned unchanged.
func parseRSSFormat(data []byte) ([]RssItem, error) {
	dec := xml.NewDecoder(strings.NewReader(string(data)))
	// No transcoding is performed: whatever charset the document declares
	// (GB2312, GBK, GB18030 or anything else), the raw stream is handed
	// back to the decoder as-is.
	dec.CharsetReader = func(_ string, input io.Reader) (io.Reader, error) {
		return input, nil
	}

	var feed RSSFeed
	if err := dec.Decode(&feed); err != nil {
		return nil, err
	}

	var result []RssItem
	for _, src := range feed.Channel.Items {
		// Fingerprint of the item's content.
		digest := md5.Sum([]byte(fmt.Sprintf("%s%s%s", src.Title, src.Link, src.Description)))

		converted := RssItem{
			Title:       src.Title,
			Link:        src.Link,
			Description: src.Description,
			GUID:        src.GUID,
			Author:      src.Author,
			Category:    src.Category,
			Hash:        fmt.Sprintf("%x", digest),
		}
		if src.PubDate != "" {
			converted.PubDate = parseTimeString(src.PubDate)
		}
		result = append(result, converted)
	}

	// Descending by publication date.
	sort.Slice(result, func(a, b int) bool {
		return result[b].PubDate.Before(result[a].PubDate)
	})
	return result, nil
}
// parseAtomFormat decodes data as an Atom document and converts every entry
// into an RssItem.
//
// For each entry the link is the first one whose rel is empty or
// "alternate", the description is the content value or else the summary,
// the category is the term of the first category, and the date is the
// published value or else the updated value. Each item receives an MD5 hash
// of its title, link and description concatenated in that order. The result
// is sorted by date, newest first. Any XML decoding error is returned
// unchanged.
func parseAtomFormat(data []byte) ([]RssItem, error) {
	dec := xml.NewDecoder(strings.NewReader(string(data)))
	// No transcoding is performed: whatever charset the document declares
	// (GB2312, GBK, GB18030 or anything else), the raw stream is handed
	// back to the decoder as-is.
	dec.CharsetReader = func(_ string, input io.Reader) (io.Reader, error) {
		return input, nil
	}

	var feed AtomFeed
	if err := dec.Decode(&feed); err != nil {
		return nil, err
	}

	var result []RssItem
	for _, src := range feed.Entries {
		converted := RssItem{
			Title:  src.Title,
			GUID:   src.ID,
			Author: src.Author.Name,
		}

		// Pick the first link that has no rel or is marked "alternate".
		for _, l := range src.Link {
			if l.Rel != "" && l.Rel != "alternate" {
				continue
			}
			converted.Link = l.Href
			break
		}

		// Content takes precedence over summary.
		switch {
		case src.Content.Value != "":
			converted.Description = src.Content.Value
		case src.Summary != "":
			converted.Description = src.Summary
		}

		// Only the first category is kept.
		if len(src.Category) != 0 {
			converted.Category = src.Category[0].Term
		}

		// Published takes precedence over updated.
		stamp := src.Published
		if stamp == "" {
			stamp = src.Updated
		}
		if stamp != "" {
			converted.PubDate = parseTimeString(stamp)
		}

		// Fingerprint of the item's content.
		digest := md5.Sum([]byte(fmt.Sprintf("%s%s%s", converted.Title, converted.Link, converted.Description)))
		converted.Hash = fmt.Sprintf("%x", digest)

		result = append(result, converted)
	}

	// Descending by publication date.
	sort.Slice(result, func(a, b int) bool {
		return result[b].PubDate.Before(result[a].PubDate)
	})
	return result, nil
}
// parseTimeString converts a feed timestamp into a time.Time.
//
// Surrounding whitespace is ignored. The layouts below are tried in order
// and the first one that parses wins. If the input is empty or matches no
// layout, the zero time is returned.
func parseTimeString(timeStr string) time.Time {
	// Surrounding whitespace or newlines would otherwise make every
	// layout fail.
	timeStr = strings.TrimSpace(timeStr)
	if timeStr == "" {
		return time.Time{}
	}

	timeFormats := []string{
		time.RFC3339,               // "2006-01-02T15:04:05Z07:00" (ISO 8601, common in Atom)
		time.RFC3339Nano,           // "2006-01-02T15:04:05.999999999Z07:00"
		time.RFC1123,               // "Mon, 02 Jan 2006 15:04:05 MST" (common in RSS)
		time.RFC1123Z,              // "Mon, 02 Jan 2006 15:04:05 -0700"
		time.RFC822,                // "02 Jan 06 15:04 MST"
		time.RFC822Z,               // "02 Jan 06 15:04 -0700"
		"2006-01-02T15:04:05Z",     // UTC
		"2006-01-02T15:04:05.000Z", // UTC with milliseconds
		"2006-01-02 15:04:05",      // plain date and time
		// The layouts above require a two-digit day. The ones below accept
		// a single-digit day, with or without the weekday.
		"Mon, 2 Jan 2006 15:04:05 MST",
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"2 Jan 2006 15:04:05 MST",
		"2 Jan 2006 15:04:05 -0700",
	}
	for _, format := range timeFormats {
		if parsedTime, err := time.Parse(format, timeStr); err == nil {
			return parsedTime
		}
	}
	// No layout matched.
	return time.Time{}
}