qq_bot/handler/rss/parse.go

241 lines
5.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rss
import (
"crypto/md5"
"encoding/xml"
"errors"
"fmt"
"io"
"net/http"
"sort"
"strings"
"time"
)
// CheckRssFeed issues a HEAD request to feedURL and reports whether the
// response looks like an RSS or Atom feed.
//
// It returns nil when the status code is 200 and the Content-Type header
// contains one of the accepted XML media types. Otherwise it returns the
// request error, or an error naming the unexpected status code or
// Content-Type value.
func CheckRssFeed(feedURL string) error {
	resp, err := http.Head(feedURL)
	if err != nil {
		return err
	}
	// net/http requires the caller to close the body of every response it
	// receives, including responses to HEAD requests.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	contentType := resp.Header.Get("Content-Type")
	// Media type names are case-insensitive, so compare in lower case.
	// Substring matching keeps values that carry parameters, such as
	// "application/rss+xml; charset=utf-8", acceptable.
	normalized := strings.ToLower(contentType)
	acceptedTypes := []string{
		"application/rss+xml",
		"application/atom+xml",
		"application/xml",
		"text/xml",
	}
	for _, accepted := range acceptedTypes {
		if strings.Contains(normalized, accepted) {
			return nil
		}
	}
	return fmt.Errorf("RSS源无效: %s", contentType)
}
// ParseFeed downloads the feed at feedURL and parses it as RSS, falling back
// to Atom.
//
// A HEAD request is sent first; the feed is rejected when that response is
// not 200, or when its Content-Length is 0 or larger than 10 MiB. The feed is
// then fetched with GET, and the same size limit is enforced on the bytes
// actually read.
//
// It returns the feed title and the parsed items. An error is returned when
// a request fails, a status code is not 200, the feed is too large, the body
// cannot be read, neither format can be decoded, or the Atom fallback decodes
// but contains no entries.
//
// NOTE(review): both requests go through the default HTTP client, which has
// no timeout; consider whether callers need one.
func ParseFeed(feedURL string) (string, []RssItem, error) {
	const maxFeedSize = 10 * 1024 * 1024

	// Pre-flight check of status and advertised size.
	head, err := http.Head(feedURL)
	if err != nil {
		return "", nil, err
	}
	// net/http requires every response body to be closed, HEAD included.
	head.Body.Close()
	if head.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("RSS源无效: %d", head.StatusCode)
	}
	if head.ContentLength == 0 || head.ContentLength > maxFeedSize {
		return "", nil, fmt.Errorf("RSS源的大小为%d,超出限制", head.ContentLength)
	}

	// Fetch the feed itself.
	resp, err := http.Get(feedURL)
	if err != nil {
		return "", nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	// The HEAD Content-Length may be unknown (-1) or simply wrong, so cap the
	// read itself. Reading one byte past the limit tells "exactly at the
	// limit" apart from "too large".
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxFeedSize+1))
	if err != nil {
		return "", nil, fmt.Errorf("读取RSS数据失败: %w", err)
	}
	if len(body) > maxFeedSize {
		return "", nil, fmt.Errorf("RSS源的大小超过%d字节,超出限制", maxFeedSize)
	}

	// Try RSS first.
	rssTitle, rssItems, rssErr := parseRSSFormat(body)
	if rssErr == nil && len(rssItems) > 0 {
		return rssTitle, rssItems, nil
	}

	// Fall back to Atom when RSS decoding failed or produced nothing.
	// NOTE(review): if RSSFeed does not pin its root element name, an Atom
	// document may decode without error into an empty RSS feed, so an
	// error-only fallback would never reach the Atom parser — confirm against
	// the RSSFeed definition.
	atomTitle, atomItems, atomErr := parseAtomFormat(body)
	if atomErr == nil && len(atomItems) > 0 {
		return atomTitle, atomItems, nil
	}

	// A document that decoded as RSS but has no items is still reported as a
	// successful, empty RSS feed.
	if rssErr == nil {
		return rssTitle, rssItems, nil
	}
	if atomErr != nil {
		return "", nil, fmt.Errorf("解析RSS/Atom数据失败: %w", atomErr)
	}
	return atomTitle, nil, errors.New("未解析到rss信息")
}
// parseRSSFormat decodes data as an RSS document and converts each channel
// item into an RssItem.
//
// It returns the channel title and the items ordered newest first. Items
// with the same publication time keep their order from the document. An
// error is returned only when XML decoding fails.
func parseRSSFormat(data []byte) (string, []RssItem, error) {
	var rssFeed RSSFeed
	decoder := xml.NewDecoder(strings.NewReader(string(data)))
	// No transcoding is performed: whatever encoding the document declares
	// (GB2312, GBK, GB18030, ...), the bytes are handed to the decoder
	// unchanged. Conversion logic for non-UTF-8 feeds would go here.
	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		return input, nil
	}
	if err := decoder.Decode(&rssFeed); err != nil {
		return "", nil, err
	}

	// Convert every channel item.
	var items []RssItem
	for _, item := range rssFeed.Channel.Items {
		rssItem := RssItem{
			Title:       item.Title,
			Link:        item.Link,
			Description: item.Description,
			GUID:        item.GUID,
			Author:      item.Author,
			Category:    item.Category,
		}
		// PubDate stays at the zero time when the item has no date; an
		// unparseable date also yields the zero time from parseTimeString.
		if item.PubDate != "" {
			rssItem.PubDate = parseTimeString(item.PubDate)
		}
		// Content fingerprint: hex MD5 of title, link and description
		// concatenated without separators.
		content := fmt.Sprintf("%s%s%s", item.Title, item.Link, item.Description)
		hash := md5.Sum([]byte(content))
		rssItem.Hash = fmt.Sprintf("%x", hash)
		items = append(items, rssItem)
	}

	// Newest first. The sort must be stable so that items with equal or
	// missing timestamps are not shuffled relative to the document.
	sort.SliceStable(items, func(i, j int) bool {
		return items[i].PubDate.After(items[j].PubDate)
	})
	return rssFeed.Channel.Title, items, nil
}
// parseAtomFormat decodes data as an Atom document and converts each entry
// into an RssItem.
//
// It returns the feed title and the items ordered newest first. Entries
// with the same timestamp keep their order from the document. An error is
// returned only when XML decoding fails.
func parseAtomFormat(data []byte) (string, []RssItem, error) {
	var atomFeed AtomFeed
	decoder := xml.NewDecoder(strings.NewReader(string(data)))
	// No transcoding is performed: whatever encoding the document declares
	// (GB2312, GBK, GB18030, ...), the bytes are handed to the decoder
	// unchanged. Conversion logic for non-UTF-8 feeds would go here.
	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		return input, nil
	}
	if err := decoder.Decode(&atomFeed); err != nil {
		return "", nil, err
	}

	// Convert every entry.
	var items []RssItem
	for _, entry := range atomFeed.Entries {
		rssItem := RssItem{
			Title:  entry.Title,
			GUID:   entry.ID,
			Author: entry.Author.Name,
		}

		// Link: the first link whose rel is empty or "alternate". The field
		// stays empty when no link qualifies.
		for _, link := range entry.Link {
			if link.Rel == "" || link.Rel == "alternate" {
				rssItem.Link = link.Href
				break
			}
		}

		// Description: prefer the content, fall back to the summary.
		if entry.Content.Value != "" {
			rssItem.Description = entry.Content.Value
		} else if entry.Summary != "" {
			rssItem.Description = entry.Summary
		}

		// Category: only the first category term is kept.
		if len(entry.Category) > 0 {
			rssItem.Category = entry.Category[0].Term
		}

		// Timestamp: prefer the published time, fall back to the updated
		// time. PubDate stays at the zero time when both are empty.
		timeStr := entry.Published
		if timeStr == "" {
			timeStr = entry.Updated
		}
		if timeStr != "" {
			rssItem.PubDate = parseTimeString(timeStr)
		}

		// Content fingerprint: hex MD5 of title, link and description
		// concatenated without separators.
		content := fmt.Sprintf("%s%s%s", rssItem.Title, rssItem.Link, rssItem.Description)
		hash := md5.Sum([]byte(content))
		rssItem.Hash = fmt.Sprintf("%x", hash)
		items = append(items, rssItem)
	}

	// Newest first. The sort must be stable so that entries with equal or
	// missing timestamps are not shuffled relative to the document.
	sort.SliceStable(items, func(i, j int) bool {
		return items[i].PubDate.After(items[j].PubDate)
	})
	return atomFeed.Title, items, nil
}
// parseTimeString parses a feed timestamp, trying the layouts commonly found
// in Atom (RFC 3339) and RSS (RFC 822 / RFC 1123 style) documents.
//
// Surrounding whitespace is ignored. The zero time.Time is returned when no
// layout matches.
//
// NOTE: per the time.Parse documentation, a zone abbreviation (for example
// "EST") that is unknown in the current location is accepted but recorded
// with a zero offset, so such timestamps may be shifted.
func parseTimeString(timeStr string) time.Time {
	// Element text in XML is frequently padded with spaces or newlines,
	// which would make every layout fail.
	timeStr = strings.TrimSpace(timeStr)
	if timeStr == "" {
		return time.Time{}
	}

	// A "2" day in a layout accepts both one- and two-digit days when
	// parsing, so each entry below also covers the zero-padded form used by
	// the corresponding time.RFC* constant. Numeric-offset layouts come
	// before their abbreviation counterparts because they are unambiguous.
	layouts := []string{
		// ISO 8601 / Atom. When parsing, this layout also accepts
		// fractional seconds and a literal "Z" zone.
		time.RFC3339,
		"Mon, 2 Jan 2006 15:04:05 -0700", // RFC 1123 with numeric zone (typical RSS)
		"Mon, 2 Jan 2006 15:04:05 MST",   // RFC 1123 with zone abbreviation
		"2 Jan 2006 15:04:05 -0700",      // same, weekday omitted
		"2 Jan 2006 15:04:05 MST",
		"2 Jan 06 15:04 -0700", // RFC 822 with numeric zone
		"2 Jan 06 15:04 MST",   // RFC 822 with zone abbreviation
		"2006-01-02 15:04:05",  // plain date and time, interpreted as UTC
	}
	for _, layout := range layouts {
		if parsed, err := time.Parse(layout, timeStr); err == nil {
			return parsed
		}
	}
	// Nothing matched.
	return time.Time{}
}