236 lines
5.5 KiB
Go
236 lines
5.5 KiB
Go
package rss
|
||
|
||
import (
|
||
"crypto/md5"
|
||
"encoding/xml"
|
||
"fmt"
|
||
"io"
|
||
"net/http"
|
||
"sort"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
// CheckRssFeed reports whether feedURL looks like a reachable RSS/Atom feed.
//
// It issues a HEAD request and returns nil only when the response status is
// 200 and the Content-Type header names one of the accepted XML feed media
// types. Otherwise it returns the request error, or an error describing the
// unexpected status code or content type.
func CheckRssFeed(feedURL string) error {
	// Only the response headers are inspected; the feed itself is not downloaded.
	resp, err := http.Head(feedURL)
	if err != nil {
		return err
	}
	// The body must be closed on every return path so the underlying
	// connection is released.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	contentType := resp.Header.Get("Content-Type")
	// Media type names are case-insensitive, so compare in lower case. A
	// substring match tolerates parameters such as "; charset=utf-8".
	normalized := strings.ToLower(contentType)

	// Content types accepted for RSS and Atom feeds.
	validContentTypes := []string{
		"application/rss+xml",
		"application/atom+xml",
		"application/xml",
		"text/xml",
	}
	for _, validType := range validContentTypes {
		if strings.Contains(normalized, validType) {
			return nil
		}
	}

	return fmt.Errorf("RSS源无效: %s", contentType)
}
|
||
|
||
func ParseRssFeed(feedURL string) ([]RssItem, error) {
|
||
//确认大小
|
||
resp, err := http.Head(feedURL)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.StatusCode != 200 {
|
||
return nil, fmt.Errorf("RSS源无效: %d", resp.StatusCode)
|
||
}
|
||
if resp.ContentLength == 0 || resp.ContentLength > 1024*1024*10 {
|
||
return nil, fmt.Errorf("RSS源的大小为%d,超出限制", resp.ContentLength)
|
||
}
|
||
|
||
//获取rss数据
|
||
resp, err = http.Get(feedURL)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
// 读取响应体内容
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取RSS数据失败: %v", err)
|
||
}
|
||
|
||
// 首先尝试解析为RSS格式
|
||
items, err := parseRSSFormat(body)
|
||
if err == nil {
|
||
return items, nil
|
||
}
|
||
|
||
// 如果RSS格式解析失败,尝试解析为Atom格式
|
||
items, err = parseAtomFormat(body)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("解析RSS/Atom数据失败: %v", err)
|
||
}
|
||
|
||
return items, nil
|
||
}
|
||
|
||
// 解析RSS格式
|
||
func parseRSSFormat(data []byte) ([]RssItem, error) {
|
||
var rssFeed RSSFeed
|
||
decoder := xml.NewDecoder(strings.NewReader(string(data)))
|
||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||
// 处理不同的字符编码
|
||
switch charset {
|
||
case "GB2312", "GBK", "GB18030":
|
||
// 如果需要处理中文编码,可以在这里添加转换逻辑
|
||
return input, nil
|
||
default:
|
||
return input, nil
|
||
}
|
||
}
|
||
|
||
if err := decoder.Decode(&rssFeed); err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// 转换为RssItem数组
|
||
var items []RssItem
|
||
for _, item := range rssFeed.Channel.Items {
|
||
rssItem := RssItem{
|
||
Title: item.Title,
|
||
Link: item.Link,
|
||
Description: item.Description,
|
||
GUID: item.GUID,
|
||
Author: item.Author,
|
||
Category: item.Category,
|
||
}
|
||
|
||
// 解析发布时间
|
||
if item.PubDate != "" {
|
||
rssItem.PubDate = parseTimeString(item.PubDate)
|
||
}
|
||
|
||
// 生成内容哈希值
|
||
content := fmt.Sprintf("%s%s%s", item.Title, item.Link, item.Description)
|
||
hash := md5.Sum([]byte(content))
|
||
rssItem.Hash = fmt.Sprintf("%x", hash)
|
||
|
||
items = append(items, rssItem)
|
||
}
|
||
|
||
//按时间降序排序
|
||
sort.Slice(items, func(i, j int) bool {
|
||
return items[i].PubDate.After(items[j].PubDate)
|
||
})
|
||
|
||
return items, nil
|
||
}
|
||
|
||
// 解析Atom格式
|
||
func parseAtomFormat(data []byte) ([]RssItem, error) {
|
||
var atomFeed AtomFeed
|
||
decoder := xml.NewDecoder(strings.NewReader(string(data)))
|
||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||
// 处理不同的字符编码
|
||
switch charset {
|
||
case "GB2312", "GBK", "GB18030":
|
||
// 如果需要处理中文编码,可以在这里添加转换逻辑
|
||
return input, nil
|
||
default:
|
||
return input, nil
|
||
}
|
||
}
|
||
|
||
if err := decoder.Decode(&atomFeed); err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// 转换为RssItem数组
|
||
var items []RssItem
|
||
for _, entry := range atomFeed.Entries {
|
||
rssItem := RssItem{
|
||
Title: entry.Title,
|
||
GUID: entry.ID,
|
||
Author: entry.Author.Name,
|
||
}
|
||
|
||
// 获取链接
|
||
for _, link := range entry.Link {
|
||
if link.Rel == "" || link.Rel == "alternate" {
|
||
rssItem.Link = link.Href
|
||
break
|
||
}
|
||
}
|
||
|
||
// 获取描述内容
|
||
if entry.Content.Value != "" {
|
||
rssItem.Description = entry.Content.Value
|
||
} else if entry.Summary != "" {
|
||
rssItem.Description = entry.Summary
|
||
}
|
||
|
||
// 获取分类
|
||
if len(entry.Category) > 0 {
|
||
rssItem.Category = entry.Category[0].Term
|
||
}
|
||
|
||
// 解析发布时间
|
||
timeStr := entry.Published
|
||
if timeStr == "" {
|
||
timeStr = entry.Updated
|
||
}
|
||
if timeStr != "" {
|
||
rssItem.PubDate = parseTimeString(timeStr)
|
||
}
|
||
|
||
// 生成内容哈希值
|
||
content := fmt.Sprintf("%s%s%s", rssItem.Title, rssItem.Link, rssItem.Description)
|
||
hash := md5.Sum([]byte(content))
|
||
rssItem.Hash = fmt.Sprintf("%x", hash)
|
||
|
||
items = append(items, rssItem)
|
||
}
|
||
|
||
//按时间降序排序
|
||
sort.Slice(items, func(i, j int) bool {
|
||
return items[i].PubDate.After(items[j].PubDate)
|
||
})
|
||
|
||
return items, nil
|
||
}
|
||
|
||
// parseTimeString parses a feed timestamp by trying a list of known layouts
// in order and returning the first successful result.
//
// Leading and trailing whitespace is ignored. The zero time.Time is returned
// when no layout matches.
func parseTimeString(timeStr string) time.Time {
	// Text taken from XML elements often carries stray spaces or newlines,
	// which would otherwise make every layout fail.
	timeStr = strings.TrimSpace(timeStr)
	if timeStr == "" {
		return time.Time{}
	}

	// Layouts are tried in order. New layouts are appended after the
	// original ones so previously accepted inputs parse exactly as before.
	timeFormats := []string{
		// "2006-01-02T15:04:05Z07:00" (ISO 8601, common in Atom).
		time.RFC3339,
		// "2006-01-02T15:04:05.999999999Z07:00".
		time.RFC3339Nano,
		// "Mon, 02 Jan 2006 15:04:05 MST" (common in RSS).
		time.RFC1123,
		// "Mon, 02 Jan 2006 15:04:05 -0700".
		time.RFC1123Z,
		// "02 Jan 06 15:04 MST".
		time.RFC822,
		// "02 Jan 06 15:04 -0700".
		time.RFC822Z,
		// UTC time.
		"2006-01-02T15:04:05Z",
		// UTC time with milliseconds.
		"2006-01-02T15:04:05.000Z",
		// Plain date and time without a zone.
		"2006-01-02 15:04:05",
		// RFC 822 style dates with a one- or two-digit day of month.
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"Mon, 2 Jan 2006 15:04:05 MST",
		// The same without the optional weekday.
		"2 Jan 2006 15:04:05 -0700",
		"2 Jan 2006 15:04:05 MST",
	}

	for _, format := range timeFormats {
		if parsedTime, err := time.Parse(format, timeStr); err == nil {
			return parsedTime
		}
	}

	// Every layout failed; report the zero time.
	return time.Time{}
}
|