package rss

import (
	"bytes"
	"crypto/md5"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strings"
	"time"
)

// maxFeedSize is the largest feed body, in bytes, that ParseRssFeed accepts.
const maxFeedSize = 10 * 1024 * 1024

// feedRequestTimeout bounds every HTTP request issued from this file,
// including the time spent reading the response body.
const feedRequestTimeout = 30 * time.Second

// feedHTTPClient is shared by all feed requests. Unlike http.DefaultClient it
// carries a timeout, so an unresponsive server cannot block a caller forever.
var feedHTTPClient = &http.Client{Timeout: feedRequestTimeout}

// CheckRssFeed issues a HEAD request to feedURL and reports whether the
// response looks like an RSS/Atom feed.
//
// It returns nil when the status is 200 and the Content-Type header contains
// one of the accepted XML feed media types (matched case-insensitively).
// Otherwise it returns the transport error, or an error carrying the
// offending status code or Content-Type value.
func CheckRssFeed(feedURL string) error {
	resp, err := feedHTTPClient.Head(feedURL)
	if err != nil {
		return err
	}
	// Even a HEAD response owns a body that must be closed, otherwise the
	// underlying connection is leaked.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	contentType := resp.Header.Get("Content-Type")
	// Media types are case-insensitive, so compare against a lowered copy
	// while reporting the header exactly as the server sent it.
	lowered := strings.ToLower(contentType)
	validContentTypes := []string{
		"application/rss+xml",
		"application/atom+xml",
		"application/xml",
		"text/xml",
	}
	for _, validType := range validContentTypes {
		if strings.Contains(lowered, validType) {
			return nil
		}
	}
	return fmt.Errorf("RSS源无效: %s", contentType)
}

// ParseRssFeed downloads the feed at feedURL and parses it, trying the RSS
// format first and falling back to Atom.
//
// It returns the feed title and its items sorted by publication time, newest
// first. An error is returned when the HEAD pre-check or the download fails,
// when the body is empty or larger than maxFeedSize, or when the payload
// parses as neither RSS nor Atom.
func ParseRssFeed(feedURL string) (string, []RssItem, error) {
	if err := checkFeedSize(feedURL); err != nil {
		return "", nil, err
	}

	body, err := fetchFeedBody(feedURL)
	if err != nil {
		return "", nil, err
	}

	// NOTE(review): the Atom fallback only triggers when RSS decoding returns
	// an error; this presumably relies on RSSFeed rejecting a non-<rss> root
	// element — confirm against the RSSFeed declaration.
	if title, items, rssErr := parseRSSFormat(body); rssErr == nil {
		return title, items, nil
	}

	title, items, err := parseAtomFormat(body)
	if err != nil {
		return "", nil, fmt.Errorf("解析RSS/Atom数据失败: %w", err)
	}
	return title, items, nil
}

// checkFeedSize performs the HEAD pre-check for ParseRssFeed: the status must
// be 200 and the advertised Content-Length must be neither zero nor above
// maxFeedSize. An unknown length (-1) is let through here and enforced later
// while the body is read.
func checkFeedSize(feedURL string) error {
	resp, err := feedHTTPClient.Head(feedURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}
	if resp.ContentLength == 0 || resp.ContentLength > maxFeedSize {
		return fmt.Errorf("RSS源的大小为%d,超出限制", resp.ContentLength)
	}
	return nil
}

// fetchFeedBody downloads feedURL with GET and returns at most maxFeedSize
// bytes of body. It fails on a non-200 status or when the body exceeds the
// limit, regardless of what the HEAD response advertised.
func fetchFeedBody(feedURL string) ([]byte, error) {
	resp, err := feedHTTPClient.Get(feedURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("RSS源无效: %d", resp.StatusCode)
	}

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly maxFeedSize long.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxFeedSize+1))
	if err != nil {
		return nil, fmt.Errorf("读取RSS数据失败: %w", err)
	}
	if len(body) > maxFeedSize {
		return nil, fmt.Errorf("RSS源的大小超过%d字节,超出限制", maxFeedSize)
	}
	return body, nil
}

// newFeedDecoder returns an XML decoder over data that accepts documents
// declaring any character set.
//
// The CharsetReader passes the input through unchanged: feeds declaring a
// non-UTF-8 encoding (e.g. GB2312, GBK, GB18030) are not rejected, but their
// bytes are not transcoded either. Real conversion would have to be added
// here.
func newFeedDecoder(data []byte) *xml.Decoder {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		return input, nil
	}
	return decoder
}

// hashItemContent returns the hex-encoded MD5 digest of the concatenated
// title, link and description. MD5 is not collision-resistant, so the value
// must not be relied on for anything security-sensitive.
func hashItemContent(title, link, description string) string {
	sum := md5.Sum([]byte(title + link + description))
	return fmt.Sprintf("%x", sum)
}

// sortByPubDateDesc orders items newest first. The sort is stable, so items
// with identical (or missing, i.e. zero) publication times keep the order in
// which they appeared in the feed.
func sortByPubDateDesc(items []RssItem) {
	sort.SliceStable(items, func(i, j int) bool {
		return items[i].PubDate.After(items[j].PubDate)
	})
}

// parseRSSFormat decodes data as an RSS document and returns the channel
// title together with its items, newest first. It returns the decoder's error
// when data cannot be decoded into an RSSFeed.
func parseRSSFormat(data []byte) (string, []RssItem, error) {
	var rssFeed RSSFeed
	if err := newFeedDecoder(data).Decode(&rssFeed); err != nil {
		return "", nil, err
	}

	// Left nil when the channel has no items, matching a plain append loop.
	var items []RssItem
	for _, item := range rssFeed.Channel.Items {
		rssItem := RssItem{
			Title:       item.Title,
			Link:        item.Link,
			Description: item.Description,
			GUID:        item.GUID,
			Author:      item.Author,
			Category:    item.Category,
		}

		// An absent or unparseable date leaves PubDate as the zero time.
		if item.PubDate != "" {
			rssItem.PubDate = parseTimeString(item.PubDate)
		}

		rssItem.Hash = hashItemContent(item.Title, item.Link, item.Description)
		items = append(items, rssItem)
	}

	sortByPubDateDesc(items)
	return rssFeed.Channel.Title, items, nil
}

// parseAtomFormat decodes data as an Atom document and returns the feed title
// together with its entries converted to RssItem values, newest first. It
// returns the decoder's error when data cannot be decoded into an AtomFeed.
func parseAtomFormat(data []byte) (string, []RssItem, error) {
	var atomFeed AtomFeed
	if err := newFeedDecoder(data).Decode(&atomFeed); err != nil {
		return "", nil, err
	}

	// Left nil when the feed has no entries, matching a plain append loop.
	var items []RssItem
	for _, entry := range atomFeed.Entries {
		rssItem := RssItem{
			Title:  entry.Title,
			GUID:   entry.ID,
			Author: entry.Author.Name,
		}

		// Use the first link whose rel is empty or "alternate"; other rel
		// values (self, enclosure, ...) are skipped.
		for _, link := range entry.Link {
			if link.Rel == "" || link.Rel == "alternate" {
				rssItem.Link = link.Href
				break
			}
		}

		// Prefer the full content and fall back to the summary.
		switch {
		case entry.Content.Value != "":
			rssItem.Description = entry.Content.Value
		case entry.Summary != "":
			rssItem.Description = entry.Summary
		}

		// Only the first category is kept.
		if len(entry.Category) > 0 {
			rssItem.Category = entry.Category[0].Term
		}

		// Prefer the published timestamp and fall back to updated.
		timeStr := entry.Published
		if timeStr == "" {
			timeStr = entry.Updated
		}
		if timeStr != "" {
			rssItem.PubDate = parseTimeString(timeStr)
		}

		rssItem.Hash = hashItemContent(rssItem.Title, rssItem.Link, rssItem.Description)
		items = append(items, rssItem)
	}

	sortByPubDateDesc(items)
	return atomFeed.Title, items, nil
}

// parseTimeString parses a feed timestamp by trying a list of layouts in
// order and returning the first successful result. Surrounding whitespace is
// ignored. It returns the zero time.Time when no layout matches.
func parseTimeString(timeStr string) time.Time {
	// Element text frequently carries stray spaces or newlines.
	timeStr = strings.TrimSpace(timeStr)

	timeFormats := []string{
		time.RFC3339,     // "2006-01-02T15:04:05Z07:00" (ISO 8601, common in Atom)
		time.RFC3339Nano, // "2006-01-02T15:04:05.999999999Z07:00"
		time.RFC1123,     // "Mon, 02 Jan 2006 15:04:05 MST" (common in RSS)
		time.RFC1123Z,    // "Mon, 02 Jan 2006 15:04:05 -0700"
		time.RFC822,      // "02 Jan 06 15:04 MST"
		time.RFC822Z,     // "02 Jan 06 15:04 -0700"
		"2006-01-02T15:04:05Z",     // UTC
		"2006-01-02T15:04:05.000Z", // UTC with milliseconds
		"2006-01-02 15:04:05",      // plain date and time, no zone
		// The RFC1123 layouts above require a two-digit day; many RSS feeds
		// emit a single-digit day such as "Sat, 7 Sep 2002 00:00:01 GMT".
		"Mon, 2 Jan 2006 15:04:05 MST",
		"Mon, 2 Jan 2006 15:04:05 -0700",
	}

	for _, format := range timeFormats {
		if parsedTime, err := time.Parse(format, timeStr); err == nil {
			return parsedTime
		}
	}

	// Every layout failed: signal "unknown" with the zero time.
	return time.Time{}
}