411 lines
10 KiB
Go
411 lines
10 KiB
Go
package rss
|
||
|
||
import (
|
||
"crypto/md5"
|
||
"encoding/xml"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"log"
|
||
"net"
|
||
"net/http"
|
||
"net/url"
|
||
"sort"
|
||
"strings"
|
||
"time"
|
||
|
||
"git.lxtend.com/lixiangwuxian/qqbot/config"
|
||
"golang.org/x/net/proxy"
|
||
)
|
||
|
||
// createHTTPClient 创建支持代理的HTTP客户端
|
||
func createHTTPClient() *http.Client {
|
||
client := &http.Client{
|
||
Timeout: 30 * time.Second,
|
||
}
|
||
|
||
// 检查是否配置了代理
|
||
if config.ConfigManager.GetConfig().Management.ProxyAddr != "" {
|
||
proxyURL, err := url.Parse(config.ConfigManager.GetConfig().Management.ProxyAddr)
|
||
if err != nil {
|
||
log.Printf("解析代理地址失败: %v\n", err)
|
||
return client
|
||
}
|
||
|
||
if proxyURL.Scheme == "socks5" {
|
||
log.Printf("使用SOCKS5代理: %s\n", proxyURL.Host)
|
||
dialer, err := proxy.SOCKS5("tcp", proxyURL.Host, nil, proxy.Direct)
|
||
if err == nil {
|
||
client.Transport = &http.Transport{
|
||
Dial: func(network, addr string) (net.Conn, error) {
|
||
return dialer.Dial(network, addr)
|
||
},
|
||
}
|
||
} else {
|
||
log.Printf("创建SOCKS5代理失败: %v\n", err)
|
||
}
|
||
} else {
|
||
log.Printf("使用HTTP/HTTPS代理: %s\n", proxyURL.Host)
|
||
client.Transport = &http.Transport{
|
||
Proxy: http.ProxyURL(proxyURL),
|
||
}
|
||
}
|
||
}
|
||
|
||
return client
|
||
}
|
||
|
||
func CheckRssFeed(feedURL string) error {
|
||
//确认返回头
|
||
client := createHTTPClient()
|
||
resp, err := client.Head(feedURL)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if resp.StatusCode != 200 {
|
||
return fmt.Errorf("RSS源无效: %d", resp.StatusCode)
|
||
}
|
||
contentType := resp.Header.Get("Content-Type")
|
||
// 支持多种RSS/Atom的Content-Type
|
||
validContentTypes := []string{
|
||
"application/rss+xml",
|
||
"application/atom+xml",
|
||
"application/xml",
|
||
"text/xml",
|
||
}
|
||
|
||
isValid := false
|
||
for _, validType := range validContentTypes {
|
||
if strings.Contains(contentType, validType) {
|
||
isValid = true
|
||
break
|
||
}
|
||
}
|
||
|
||
if !isValid {
|
||
return fmt.Errorf("RSS源无效: %s", resp.Header.Get("Content-Type"))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func ParseFeed(feedURL string) (string, []RssItem, error) {
|
||
//确认大小
|
||
// resp, err := http.Head(feedURL)
|
||
// if err != nil {
|
||
// return "", nil, err
|
||
// }
|
||
// if resp.StatusCode != 200 {
|
||
// return "", nil, fmt.Errorf("RSS源无效: %d", resp.StatusCode)
|
||
// }
|
||
// if resp.ContentLength == 0 || resp.ContentLength > 1024*1024*10 {
|
||
// return "", nil, fmt.Errorf("RSS源的大小为%d,超出限制", resp.ContentLength)
|
||
// }
|
||
|
||
//获取rss数据
|
||
client := createHTTPClient()
|
||
resp, err := client.Get(feedURL)
|
||
if err != nil {
|
||
return "", nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
// 读取响应体内容
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
return "", nil, fmt.Errorf("读取RSS数据失败: %v", err)
|
||
}
|
||
|
||
// 首先尝试解析为RSS格式
|
||
title, items, err := parseRSSFormat(body)
|
||
if err == nil {
|
||
return title, items, nil
|
||
}
|
||
|
||
// 如果RSS格式解析失败,尝试解析为Atom格式
|
||
title, items, err = parseAtomFormat(body)
|
||
if err != nil {
|
||
return "", nil, fmt.Errorf("解析RSS/Atom数据失败: %v", err)
|
||
}
|
||
|
||
if len(items) == 0 {
|
||
return title, nil, errors.New("未解析到rss信息")
|
||
}
|
||
|
||
return title, items, nil
|
||
}
|
||
|
||
// 解析RSS格式
|
||
func parseRSSFormat(data []byte) (string, []RssItem, error) {
|
||
var rssFeed RSSFeed
|
||
decoder := xml.NewDecoder(strings.NewReader(string(data)))
|
||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||
// 处理不同的字符编码
|
||
switch charset {
|
||
case "GB2312", "GBK", "GB18030":
|
||
// 如果需要处理中文编码,可以在这里添加转换逻辑
|
||
return input, nil
|
||
default:
|
||
return input, nil
|
||
}
|
||
}
|
||
|
||
if err := decoder.Decode(&rssFeed); err != nil {
|
||
return "", nil, err
|
||
}
|
||
|
||
// 转换为RssItem数组
|
||
var items []RssItem
|
||
for _, item := range rssFeed.Channel.Items {
|
||
rssItem := RssItem{
|
||
Title: item.Title,
|
||
Link: item.Link,
|
||
Description: item.Description,
|
||
GUID: item.GUID,
|
||
Author: item.Author,
|
||
Category: item.Category,
|
||
}
|
||
|
||
// 解析发布时间
|
||
if item.PubDate != "" {
|
||
rssItem.PubDate = parseTimeString(item.PubDate)
|
||
}
|
||
|
||
// 生成标题哈希值
|
||
content := fmt.Sprintf("%s%s", item.Title, item.Link)
|
||
hash := md5.Sum([]byte(content))
|
||
rssItem.Hash = fmt.Sprintf("%x", hash)
|
||
|
||
items = append(items, rssItem)
|
||
}
|
||
|
||
//按时间降序排序
|
||
sort.Slice(items, func(i, j int) bool {
|
||
return items[i].PubDate.After(items[j].PubDate)
|
||
})
|
||
|
||
return rssFeed.Channel.Title, items, nil
|
||
}
|
||
|
||
// 解析Atom格式
|
||
func parseAtomFormat(data []byte) (string, []RssItem, error) {
|
||
var atomFeed AtomFeed
|
||
decoder := xml.NewDecoder(strings.NewReader(string(data)))
|
||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||
// 处理不同的字符编码
|
||
switch charset {
|
||
case "GB2312", "GBK", "GB18030":
|
||
// 如果需要处理中文编码,可以在这里添加转换逻辑
|
||
return input, nil
|
||
default:
|
||
return input, nil
|
||
}
|
||
}
|
||
|
||
if err := decoder.Decode(&atomFeed); err != nil {
|
||
return "", nil, err
|
||
}
|
||
|
||
// 转换为RssItem数组
|
||
var items []RssItem
|
||
for _, entry := range atomFeed.Entries {
|
||
rssItem := RssItem{
|
||
Title: entry.Title,
|
||
GUID: entry.ID,
|
||
Author: entry.Author.Name,
|
||
}
|
||
|
||
// 获取链接
|
||
for _, link := range entry.Link {
|
||
if link.Rel == "" || link.Rel == "alternate" {
|
||
rssItem.Link = link.Href
|
||
break
|
||
}
|
||
}
|
||
|
||
// 获取描述内容
|
||
if entry.Content.Value != "" {
|
||
rssItem.Description = entry.Content.Value
|
||
} else if entry.Summary != "" {
|
||
rssItem.Description = entry.Summary
|
||
}
|
||
|
||
// 获取分类
|
||
if len(entry.Category) > 0 {
|
||
rssItem.Category = entry.Category[0].Term
|
||
}
|
||
|
||
// 解析发布时间
|
||
timeStr := entry.Published
|
||
if timeStr == "" {
|
||
timeStr = entry.Updated
|
||
}
|
||
if timeStr != "" {
|
||
rssItem.PubDate = parseTimeString(timeStr)
|
||
}
|
||
|
||
// 生成内容哈希值
|
||
content := fmt.Sprintf("%s%s", rssItem.Title, rssItem.Link)
|
||
hash := md5.Sum([]byte(content))
|
||
rssItem.Hash = fmt.Sprintf("%x", hash)
|
||
|
||
items = append(items, rssItem)
|
||
}
|
||
|
||
//按时间降序排序
|
||
sort.Slice(items, func(i, j int) bool {
|
||
return items[i].PubDate.After(items[j].PubDate)
|
||
})
|
||
|
||
return atomFeed.Title, items, nil
|
||
}
|
||
|
||
// parseTimeString parses a feed timestamp by trying a list of known layouts
// in order and returning the first successful result.
//
// Leading and trailing whitespace is ignored, since XML character data often
// carries surrounding spaces or newlines. If the string is empty or matches
// none of the layouts, the zero time.Time is returned.
func parseTimeString(timeStr string) time.Time {
	timeStr = strings.TrimSpace(timeStr)
	if timeStr == "" {
		return time.Time{}
	}

	timeFormats := []string{
		time.RFC3339,               // "2006-01-02T15:04:05Z07:00" (ISO 8601, common in Atom)
		time.RFC3339Nano,           // "2006-01-02T15:04:05.999999999Z07:00"
		time.RFC1123,               // "Mon, 02 Jan 2006 15:04:05 MST" (common in RSS)
		time.RFC1123Z,              // "Mon, 02 Jan 2006 15:04:05 -0700"
		time.RFC822,                // "02 Jan 06 15:04 MST"
		time.RFC822Z,               // "02 Jan 06 15:04 -0700"
		"2006-01-02T15:04:05Z",     // UTC
		"2006-01-02T15:04:05.000Z", // UTC with milliseconds
		"2006-01-02 15:04:05",      // plain date and time

		// The RFC1123 layouts above demand a two-digit day, yet many feeds
		// emit a single digit ("Tue, 3 Jun 2008 ..."). A "2" in the layout
		// accepts one or two digits.
		"Mon, 2 Jan 2006 15:04:05 MST",
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"2 Jan 2006 15:04:05 MST", // weekday omitted
		"2 Jan 2006 15:04:05 -0700",
	}

	for _, format := range timeFormats {
		if parsedTime, err := time.Parse(format, timeStr); err == nil {
			return parsedTime
		}
	}

	// Nothing matched.
	return time.Time{}
}
|
||
|
||
// CheckOPMLFile 检查OPML文件是否有效
|
||
func CheckOPMLFile(opmlURL string) error {
|
||
//确认返回头
|
||
client := createHTTPClient()
|
||
resp, err := client.Head(opmlURL)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if resp.StatusCode != 200 {
|
||
return fmt.Errorf("OPML文件无效: %d", resp.StatusCode)
|
||
}
|
||
contentType := resp.Header.Get("Content-Type")
|
||
// 支持多种OPML的Content-Type
|
||
validContentTypes := []string{
|
||
"application/xml",
|
||
"text/xml",
|
||
"text/x-opml",
|
||
"application/x-opml+xml",
|
||
}
|
||
|
||
isValid := false
|
||
for _, validType := range validContentTypes {
|
||
if strings.Contains(contentType, validType) {
|
||
isValid = true
|
||
break
|
||
}
|
||
}
|
||
|
||
if !isValid {
|
||
return fmt.Errorf("OPML文件无效: %s", resp.Header.Get("Content-Type"))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ParseOPML 解析OPML文件,返回RSS源列表
|
||
func ParseOPML(opmlURL string) ([]OPMLFeedInfo, error) {
|
||
//确认大小
|
||
client := createHTTPClient()
|
||
resp, err := client.Head(opmlURL)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.StatusCode != 200 {
|
||
return nil, fmt.Errorf("OPML文件无效: %d", resp.StatusCode)
|
||
}
|
||
if resp.ContentLength == 0 || resp.ContentLength > 1024*1024*10 {
|
||
return nil, fmt.Errorf("OPML文件的大小为%d,超出限制", resp.ContentLength)
|
||
}
|
||
|
||
//获取OPML数据
|
||
resp, err = client.Get(opmlURL)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
// 读取响应体内容
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取OPML数据失败: %v", err)
|
||
}
|
||
|
||
return ParseOPMLFormat(body)
|
||
}
|
||
|
||
// ParseOPMLFormat 解析OPML格式数据
|
||
func ParseOPMLFormat(data []byte) ([]OPMLFeedInfo, error) {
|
||
var opml OPML
|
||
decoder := xml.NewDecoder(strings.NewReader(string(data)))
|
||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
|
||
// 处理不同的字符编码
|
||
switch charset {
|
||
case "GB2312", "GBK", "GB18030":
|
||
// 如果需要处理中文编码,可以在这里添加转换逻辑
|
||
return input, nil
|
||
default:
|
||
return input, nil
|
||
}
|
||
}
|
||
|
||
if err := decoder.Decode(&opml); err != nil {
|
||
return nil, fmt.Errorf("解析OPML数据失败: %v", err)
|
||
}
|
||
|
||
var feedInfos []OPMLFeedInfo
|
||
extractFeeds(opml.Body.Outlines, "", &feedInfos)
|
||
|
||
if len(feedInfos) == 0 {
|
||
return nil, errors.New("未在OPML文件中找到RSS源")
|
||
}
|
||
|
||
return feedInfos, nil
|
||
}
|
||
|
||
// extractFeeds 递归提取RSS源信息
|
||
func extractFeeds(outlines []OPMLOutline, category string, feedInfos *[]OPMLFeedInfo) {
|
||
for _, outline := range outlines {
|
||
// 如果有xmlUrl,说明这是一个RSS源
|
||
if outline.XMLURL != "" {
|
||
title := outline.Title
|
||
if title == "" {
|
||
title = outline.Text
|
||
}
|
||
|
||
feedInfo := OPMLFeedInfo{
|
||
Title: title,
|
||
XMLURL: outline.XMLURL,
|
||
HTMLURL: outline.HTMLURL,
|
||
Description: outline.Description,
|
||
Category: category,
|
||
}
|
||
*feedInfos = append(*feedInfos, feedInfo)
|
||
} else if len(outline.Outlines) > 0 {
|
||
// 如果没有xmlUrl但有子outline,说明这是一个分类
|
||
categoryName := outline.Title
|
||
if categoryName == "" {
|
||
categoryName = outline.Text
|
||
}
|
||
|
||
// 递归处理子outline
|
||
extractFeeds(outline.Outlines, categoryName, feedInfos)
|
||
}
|
||
}
|
||
}
|