Files
decypharr/pkg/usenet/parser.go
2025-08-01 15:27:24 +01:00

864 lines
22 KiB
Go

package usenet
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"github.com/Tensai75/nzbparser"
"github.com/chrisfarms/yenc"
"github.com/nwaples/rardecode/v2"
"github.com/rs/zerolog"
"github.com/sirrobot01/decypharr/internal/nntp"
"github.com/sirrobot01/decypharr/internal/utils"
"github.com/sourcegraph/conc/pool"
"io"
"path"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
)
// NZBParser turns raw NZB documents into NZB values. Besides parsing the
// XML it uses the NNTP client to fetch segment headers and bodies so that
// file types, sizes and RAR layouts can be determined at parse time.
type NZBParser struct {
	// logger is scoped with component=nzb_parser by NewNZBParser.
	logger zerolog.Logger
	// client is used for all article downloads performed while parsing.
	client *nntp.Client
	// cache is stored by NewNZBParser; none of the methods in this part of
	// the file read it.
	cache *SegmentCache
}
// FileGroup collects the NZB file entries that share one grouping key and
// are turned into a single NZBFile.
type FileGroup struct {
	// BaseName is the grouping key (a filename with its extension removed).
	BaseName string
	// ActualFilename is the filename recorded for the entry that created the group.
	ActualFilename string
	// Type is the FileType of the entry that created the group.
	Type FileType
	// Files holds every NZB file entry assigned to the group.
	Files []nzbparser.NzbFile
	// Groups is the set of newsgroup names listed by the member files.
	Groups map[string]struct{}
}
// FileInfo is the size information getFileInfo derives from the headers of
// the first and last segment of a file group.
type FileInfo struct {
	// Size is the computed total size of the group.
	Size int64
	// ChunkSize is the byte span covered by the first segment.
	ChunkSize int64
	// Name is the name reported by the first segment's header.
	Name string
}
// NewNZBParser builds an NZBParser around the given NNTP client and segment
// cache. The supplied logger is not used directly: a child logger carrying
// component=nzb_parser is stored instead.
func NewNZBParser(client *nntp.Client, cache *SegmentCache, logger zerolog.Logger) *NZBParser {
	parser := new(NZBParser)
	parser.client = client
	parser.cache = cache
	parser.logger = logger.With().Str("component", "nzb_parser").Logger()
	return parser
}
// FileType classifies an NZB file entry. Note that FileTypeMedia is the zero
// value of the type.
type FileType int

const (
	FileTypeMedia   FileType = iota // Direct media files (.mkv, .mp4, etc.); decided by internal/utils.IsMediaFile
	FileTypeRar                     // RAR archives (.rar, .r00, .r01, etc.)
	FileTypeArchive                 // Other archives (.7z, .zip, etc.); skipped when grouping
	FileTypeIgnore                  // Files to ignore (.nfo, .txt, par2 etc.); skipped when grouping
	FileTypeUnknown                 // Name gave no answer; content sniffing is attempted for these
)
var (
	// RAR name patterns. They are case-sensitive; detectFileType lowercases
	// the name before matching against them.
	rarMainPattern = regexp.MustCompile(`\.rar$`)
	rarPartPattern = regexp.MustCompile(`\.r\d{2}$`) // .r00, .r01, ... (exactly two digits)
	rarVolumePattern = regexp.MustCompile(`\.part\d+\.rar$`) // .part1.rar, .part02.rar, ...
	// ignoreExtensions lists the suffixes that detectFileType maps to FileTypeIgnore.
	ignoreExtensions = []string{".par2", ".sfv", ".nfo", ".jpg", ".png", ".txt", ".srt", ".idx", ".sub"}
	// 7z name patterns: a single archive (.7z) or a numbered split volume (.7z.001).
	sevenZMainPattern = regexp.MustCompile(`\.7z$`)
	sevenZPartPattern = regexp.MustCompile(`\.7z\.\d{3}$`)
	// Suffix patterns used by getBaseFilename to strip a name down to its
	// base. Each is anchored at the end of the string.
	extWithNumberPattern = regexp.MustCompile(`\.[^ "\.]*\.\d+$`) // ".<ext>.<digits>", e.g. .7z.001
	volPar2Pattern = regexp.MustCompile(`(?i)\.vol\d+\+\d+\.par2?$`) // PAR2 volumes, e.g. .vol00+01.par2
	partPattern = regexp.MustCompile(`(?i)\.part\d+\.[^ "\.]*$`) // ".partN.<ext>", e.g. .part01.rar
	regularExtPattern = regexp.MustCompile(`\.[^ "\.]*$`) // any final extension
)
// PositionTracker wraps an io.Reader and keeps a running count of the bytes
// that have been read through it.
type PositionTracker struct {
	reader   io.Reader
	position int64
}

// Read forwards to the wrapped reader and adds the number of bytes it
// returned to the running position, including on calls that also report an
// error.
func (pt *PositionTracker) Read(p []byte) (n int, err error) {
	count, readErr := pt.reader.Read(p)
	pt.position = pt.position + int64(count)
	return count, readErr
}

// Position reports how many bytes have been read through the tracker so far.
func (pt *PositionTracker) Position() int64 {
	return pt.position
}
// Parse decodes the NZB XML in content and builds an NZB from it.
//
// filename and the document metadata determine the NZB name; category is
// copied through unchanged. The files listed in the document are grouped,
// each group is processed into an NZBFile, and the results are attached to
// the returned NZB together with their summed size. An error is returned
// when the XML cannot be parsed or when processing yields no files.
func (p *NZBParser) Parse(ctx context.Context, filename string, category string, content []byte) (*NZB, error) {
	raw, err := nzbparser.Parse(bytes.NewReader(content))
	if err != nil {
		return nil, fmt.Errorf("failed to parse NZB content: %w", err)
	}

	result := &NZB{
		Name:     determineNZBName(filename, raw.Meta),
		Title:    raw.Meta["title"],
		Password: raw.Meta["password"],
		Category: category,
		Status:   "parsed",
		Files:    []NZBFile{},
	}

	grouped := p.groupFiles(ctx, raw.Files)
	parsed := p.processFileGroups(ctx, grouped, result.Password)

	// NOTE(review): the ID is derived here, while result.TotalSize is still
	// zero, so the size never influences it. Computing it after the loop
	// below would change every generated ID; confirm before altering.
	result.ID = generateID(result)

	if len(parsed) == 0 {
		return nil, fmt.Errorf("no valid files found in NZB")
	}

	// Stamp each file with the NZB ID and accumulate the total size.
	for i := range parsed {
		parsed[i].NzbID = result.ID
		result.TotalSize += parsed[i].Size
		result.Files = append(result.Files, parsed[i])
	}
	return result, nil
}
// groupFiles classifies every NZB file entry and hands the classified
// entries to groupProcessedFiles.
//
// Entries without segments are dropped. Entries whose type is recognisable
// from the filename are kept as-is; the remainder are passed to
// batchDetectContentTypes, and whatever it returns is appended after the
// name-classified entries.
func (p *NZBParser) groupFiles(ctx context.Context, files nzbparser.NzbFiles) map[string]*FileGroup {
	// Alias for the anonymous struct type shared with the sibling methods.
	type classified = struct {
		file           nzbparser.NzbFile
		fileType       FileType
		actualFilename string
	}

	var (
		resolved []classified
		pending  []nzbparser.NzbFile
	)
	for _, f := range files {
		if len(f.Segments) == 0 {
			continue
		}
		switch kind := p.detectFileType(f.Filename); kind {
		case FileTypeUnknown:
			pending = append(pending, f)
		default:
			resolved = append(resolved, classified{file: f, fileType: kind, actualFilename: f.Filename})
		}
	}

	p.logger.Info().
		Int("known_files", len(resolved)).
		Int("unknown_files", len(pending)).
		Msg("File type detection")

	// Name-classified entries first, content-classified entries after them.
	resolved = append(resolved, p.batchDetectContentTypes(ctx, pending)...)
	return p.groupProcessedFiles(resolved)
}
// batchDetectContentTypes runs detectFileTypeByContent for every file in
// unknownFiles, using at most 10 concurrent workers, and returns the entries
// whose detected type is not FileTypeUnknown, in input order. It returns nil
// for an empty input or if waiting on the worker pool reports an error.
func (p *NZBParser) batchDetectContentTypes(ctx context.Context, unknownFiles []nzbparser.NzbFile) []struct {
	file           nzbparser.NzbFile
	fileType       FileType
	actualFilename string
} {
	// Alias for the anonymous struct type in the signature.
	type detection = struct {
		file           nzbparser.NzbFile
		fileType       FileType
		actualFilename string
	}

	total := len(unknownFiles)
	if total == 0 {
		return nil
	}

	const maxConcurrent = 10 // Max 10 concurrent downloads
	workers := pool.New().WithMaxGoroutines(min(total, maxConcurrent)).WithContext(ctx)

	var guard sync.Mutex
	detected := make([]detection, total)
	for idx := range unknownFiles {
		idx, candidate := idx, unknownFiles[idx] // per-iteration copies for the closure
		workers.Go(func(ctx context.Context) error {
			kind, name := p.detectFileTypeByContent(ctx, candidate)

			guard.Lock()
			defer guard.Unlock()
			detected[idx] = detection{file: candidate, fileType: kind, actualFilename: name}
			// Always nil: one undetectable file must not fail the batch.
			return nil
		})
	}
	if err := workers.Wait(); err != nil {
		return nil
	}

	// Keep only the entries that received a definite type.
	kept := make([]detection, 0, total)
	for _, d := range detected {
		if d.fileType == FileTypeUnknown {
			continue
		}
		kept = append(kept, d)
	}
	return kept
}
// groupProcessedFiles buckets classified entries into FileGroups.
//
// Entries typed FileTypeIgnore or FileTypeArchive are skipped. The bucket
// key is the base name derived from actualFilename when that is non-empty
// and differs from the entry's own filename; otherwise the entry's
// Basefilename is used. The first entry seen for a key determines the
// group's Type and ActualFilename.
func (p *NZBParser) groupProcessedFiles(allFiles []struct {
	file           nzbparser.NzbFile
	fileType       FileType
	actualFilename string
}) map[string]*FileGroup {
	byKey := make(map[string]*FileGroup)

	for _, entry := range allFiles {
		switch entry.fileType {
		case FileTypeIgnore, FileTypeArchive:
			continue // unwanted
		}

		key := entry.file.Basefilename
		if renamed := entry.actualFilename; renamed != "" && renamed != entry.file.Filename {
			key = p.getBaseFilename(renamed)
		}

		bucket, found := byKey[key]
		if !found {
			bucket = &FileGroup{
				BaseName:       key,
				ActualFilename: entry.actualFilename,
				Type:           entry.fileType,
				Files:          []nzbparser.NzbFile{},
				Groups:         make(map[string]struct{}),
			}
			byKey[key] = bucket
		}

		// The stored copy carries the detected filename.
		// NOTE(review): when actualFilename is empty this blanks the
		// filename of the stored entry; confirm that is intended.
		member := entry.file
		member.Filename = entry.actualFilename
		bucket.Files = append(bucket.Files, member)

		for _, newsgroup := range member.Groups {
			bucket.Groups[newsgroup] = struct{}{}
		}
	}
	return byKey
}
// getBaseFilename strips the trailing extension from filename and returns
// what is left. Surrounding double quotes, spaces and dashes are trimmed
// first. An empty filename yields an empty string.
func (p *NZBParser) getBaseFilename(filename string) string {
	if filename == "" {
		return ""
	}
	cleaned := strings.Trim(filename, `" -`)

	// Tried in order; only the first pattern that matches is applied.
	suffixPatterns := []*regexp.Regexp{
		volPar2Pattern,       // PAR2 volumes: .vol00+01.par2
		partPattern,          // part files: .part01.rar
		extWithNumberPattern, // extension plus number: .7z.001
		regularExtPattern,    // any other final extension (.mkv, .r01, ...)
	}
	for _, re := range suffixPatterns {
		if re.MatchString(cleaned) {
			return re.ReplaceAllString(cleaned, "")
		}
	}
	return cleaned
}
// detectFileType classifies a file purely by its (lowercased) name. Checks
// run in a fixed order: media, RAR, 7z, other archive suffixes, ignored
// suffixes. A name that matches none of them is FileTypeUnknown.
func (p *NZBParser) detectFileType(filename string) FileType {
	name := strings.ToLower(filename)

	switch {
	case p.isMediaFile(name):
		return FileTypeMedia
	case p.isRarFile(name):
		return FileTypeRar
	case sevenZMainPattern.MatchString(name), sevenZPartPattern.MatchString(name):
		return FileTypeArchive
	}

	for _, suffix := range []string{".zip", ".tar", ".gz", ".bz2"} {
		if strings.HasSuffix(name, suffix) {
			return FileTypeArchive
		}
	}
	for _, suffix := range ignoreExtensions {
		if strings.HasSuffix(name, suffix) {
			return FileTypeIgnore
		}
	}
	return FileTypeUnknown
}
// isRarFile reports whether filename matches any of the RAR name patterns
// (.rar, .rNN, .partN.rar). The patterns are case-sensitive, so the caller
// is expected to pass a lowercased name.
func (p *NZBParser) isRarFile(filename string) bool {
	for _, re := range []*regexp.Regexp{rarMainPattern, rarPartPattern, rarVolumePattern} {
		if re.MatchString(filename) {
			return true
		}
	}
	return false
}
// isMediaFile reports whether filename is a media file. The decision is
// delegated entirely to utils.IsMediaFile.
func (p *NZBParser) isMediaFile(filename string) bool {
	return utils.IsMediaFile(filename)
}
// processFileGroups processes every group in its own goroutine and returns
// the resulting files. Groups that produce no file are left out. The order
// of the returned slice follows goroutine completion and is therefore not
// deterministic. An empty input returns nil.
func (p *NZBParser) processFileGroups(ctx context.Context, groups map[string]*FileGroup, password string) []NZBFile {
	if len(groups) == 0 {
		return nil
	}

	// Buffered for one result per group, so workers never block on send.
	out := make(chan *NZBFile, len(groups))

	var pending sync.WaitGroup
	pending.Add(len(groups))
	for _, grp := range groups {
		go func(g *FileGroup) {
			defer pending.Done()
			if built := p.processFileGroup(ctx, g, password); built != nil {
				out <- built
			}
		}(grp)
	}

	// Close the channel once every worker has finished so the range ends.
	go func() {
		pending.Wait()
		close(out)
	}()

	var files []NZBFile
	for built := range out {
		files = append(files, *built)
	}
	return files
}
// processFileGroup dispatches a group to the handler for its type: RAR
// groups go to processRarArchive, archive groups produce nothing, and every
// other type (media included) is handled by processMediaFile.
func (p *NZBParser) processFileGroup(ctx context.Context, group *FileGroup, password string) *NZBFile {
	if group.Type == FileTypeArchive {
		return nil
	}
	if group.Type == FileTypeRar {
		return p.processRarArchive(ctx, group, password)
	}
	// Media, and anything unclassified, is treated as a plain media file.
	return p.processMediaFile(group, password)
}
// processMediaFile builds the NZBFile for a group of plain (non-archive)
// files.
//
// The group's files are ordered by their Number and each file's segments by
// segment number; segments are then laid out back to back using an estimated
// decoded size per segment. The final Size and SegmentSize come from
// getFileInfo when the segment headers can be downloaded, and otherwise from
// the estimated layout.
//
// It returns nil when the group has no files or no segments. A group that
// lists no newsgroups is still processed; its segments get an empty Group
// instead of the function panicking on an out-of-range index.
func (p *NZBParser) processMediaFile(group *FileGroup, password string) *NZBFile {
	if len(group.Files) == 0 {
		return nil
	}

	// Sort files for consistent ordering.
	sort.Slice(group.Files, func(i, j int) bool {
		return group.Files[i].Number < group.Files[j].Number
	})

	file := &NZBFile{
		Name:         group.BaseName + p.determineExtension(group),
		Groups:       p.getGroupsList(group.Groups),
		Segments:     []NZBSegment{},
		Password:     password,
		IsRarArchive: false,
	}

	// getGroupsList returns an empty slice when the entry lists no
	// newsgroups; indexing it unconditionally would panic.
	var primaryGroup string
	if len(file.Groups) > 0 {
		primaryGroup = file.Groups[0]
	}

	// Estimated decoded/encoded size ratio used to lay out segment offsets.
	const ratio = 0.968
	currentOffset := int64(0)
	for _, nzbFile := range group.Files {
		sort.Slice(nzbFile.Segments, func(i, j int) bool {
			return nzbFile.Segments[i].Number < nzbFile.Segments[j].Number
		})
		for _, segment := range nzbFile.Segments {
			decodedSize := int64(float64(segment.Bytes) * ratio)
			file.Segments = append(file.Segments, NZBSegment{
				Number:      segment.Number,
				MessageID:   segment.Id,
				Bytes:       int64(segment.Bytes),
				StartOffset: currentOffset,
				EndOffset:   currentOffset + decodedSize,
				Group:       primaryGroup,
			})
			currentOffset += decodedSize
		}
	}

	// Without segments there is nothing to download, and both getFileInfo
	// and the average below would fail.
	if len(file.Segments) == 0 {
		return nil
	}

	fileInfo, err := p.getFileInfo(context.Background(), group)
	if err != nil {
		p.logger.Warn().Err(err).Msg("Failed to get file info, using fallback")
		file.Size = currentOffset
		file.SegmentSize = currentOffset / int64(len(file.Segments)) // Average segment size
		return file
	}
	file.Size = fileInfo.Size
	file.SegmentSize = fileInfo.ChunkSize
	return file
}
// processRarArchive builds the NZBFile for a group of RAR volumes.
//
// Volumes are ordered by filename. extractRarInfo is asked for the name,
// size and data offset of the media file inside the archive; when it cannot
// provide them the file is named "<BaseName>.mkv" and its size is estimated
// from the segment sizes. Segments of all volumes are laid out back to back
// using an estimated decoded size per segment.
//
// It returns nil when the group has no files, or when it has no segments and
// no extracted information. A group that lists no newsgroups is still
// processed; its segments get an empty Group instead of the function
// panicking on an out-of-range index.
func (p *NZBParser) processRarArchive(ctx context.Context, group *FileGroup, password string) *NZBFile {
	if len(group.Files) == 0 {
		return nil
	}

	// Sort RAR files by part number (lexicographic filename order).
	sort.Slice(group.Files, func(i, j int) bool {
		return group.Files[i].Filename < group.Files[j].Filename
	})

	// Try to extract RAR info during parsing for better accuracy.
	extractedInfo := p.extractRarInfo(ctx, group, password)

	filename := group.BaseName + ".mkv" // Default extension
	if extractedInfo != nil && extractedInfo.FileName != "" {
		filename = extractedInfo.FileName
	}
	// Names inside the archive may include directories; keep the last element.
	filename = utils.RemoveInvalidChars(path.Base(filename))

	file := &NZBFile{
		Name:         filename,
		Groups:       p.getGroupsList(group.Groups),
		Segments:     []NZBSegment{},
		Password:     password,
		IsRarArchive: true,
	}

	// getGroupsList returns an empty slice when the entry lists no
	// newsgroups; indexing it unconditionally would panic.
	var primaryGroup string
	if len(file.Groups) > 0 {
		primaryGroup = file.Groups[0]
	}

	// Estimated decoded/encoded size ratio used to lay out segment offsets.
	const ratio = 0.968
	currentOffset := int64(0)
	for _, nzbFile := range group.Files {
		sort.Slice(nzbFile.Segments, func(i, j int) bool {
			return nzbFile.Segments[i].Number < nzbFile.Segments[j].Number
		})
		for _, segment := range nzbFile.Segments {
			decodedSize := int64(float64(segment.Bytes) * ratio)
			file.Segments = append(file.Segments, NZBSegment{
				Number:      segment.Number,
				MessageID:   segment.Id,
				Bytes:       int64(segment.Bytes),
				StartOffset: currentOffset,
				EndOffset:   currentOffset + decodedSize,
				Group:       primaryGroup,
			})
			currentOffset += decodedSize
		}
	}

	if extractedInfo != nil {
		file.Size = extractedInfo.FileSize
		file.SegmentSize = extractedInfo.SegmentSize
		file.StartOffset = extractedInfo.EstimatedStartOffset
		return file
	}

	// Fallback estimate; guard the average against a division by zero.
	if len(file.Segments) == 0 {
		return nil
	}
	file.Size = currentOffset
	file.SegmentSize = currentOffset / int64(len(file.Segments)) // Average segment size
	file.StartOffset = 0                                         // No accurate start offset available
	return file
}
// getFileInfo downloads the headers of the first segment of the first file
// and the last segment of the last file (after ordering the group's files by
// filename) and derives size information from them.
//
// The total size is computed as (number of files - 1) * first.Size +
// last.Size, i.e. it treats every file but the last as having the size
// reported by the first header. It returns an error for an empty group or
// when either header download fails. Note that it reorders group.Files.
func (p *NZBParser) getFileInfo(ctx context.Context, group *FileGroup) (*FileInfo, error) {
	parts := len(group.Files)
	if parts == 0 {
		return nil, fmt.Errorf("no files in group %s", group.BaseName)
	}

	sort.Slice(group.Files, func(a, b int) bool {
		return group.Files[a].Filename < group.Files[b].Filename
	})
	head := group.Files[0]
	tail := group.Files[parts-1]

	headInfo, err := p.client.DownloadHeader(ctx, head.Segments[0].Id)
	if err != nil {
		return nil, err
	}

	tailInfo, err := p.client.DownloadHeader(ctx, tail.Segments[len(tail.Segments)-1].Id)
	if err != nil {
		p.logger.Warn().Err(err).Msg("Failed to download last segment header")
		return nil, err
	}

	info := &FileInfo{
		Name:      headInfo.Name,
		ChunkSize: headInfo.End - (headInfo.Begin - 1),
		Size:      (int64(parts-1) * headInfo.Size) + tailInfo.Size,
	}
	return info, nil
}
// extractRarInfo downloads the beginning of the first file in the group (at
// most its first five segments), obtains the group's size information, and
// passes both to analyzeRarStructure. It returns nil when the group or its
// first file is empty, or when either download step fails.
func (p *NZBParser) extractRarInfo(ctx context.Context, group *FileGroup, password string) *ExtractedFileInfo {
	if len(group.Files) == 0 {
		return nil
	}
	leading := group.Files[0].Segments
	if len(leading) == 0 {
		return nil
	}

	const maxHeaderSegments = 5
	headerData, err := p.downloadRarHeaders(ctx, leading[:min(maxHeaderSegments, len(leading))])
	if err != nil {
		p.logger.Warn().Err(err).Msg("Failed to download RAR headers")
		return nil
	}

	sizes, err := p.getFileInfo(ctx, group)
	if err != nil {
		p.logger.Warn().Err(err).Msg("Failed to get file info for RAR group")
		return nil
	}

	// The group's size information lets the analysis compare archive size
	// against the unpacked size of the contained file.
	return p.analyzeRarStructure(headerData, password, sizes)
}
// analyzeRarStructure walks the RAR entries found in headerData looking for
// the first media file that is not a directory.
//
// For that entry the ratio fileInfo.Size / UnPackedSize is computed; when it
// exceeds 0.95 the entry is reported together with the number of bytes
// consumed from headerData so far, which is used as the data start offset.
// Any other outcome (reader cannot be opened, no media entry before the
// headers run out, ratio too low) yields nil.
func (p *NZBParser) analyzeRarStructure(headerData []byte, password string, fileInfo *FileInfo) *ExtractedFileInfo {
	tracker := &PositionTracker{reader: bytes.NewReader(headerData), position: 0}

	archive, err := rardecode.NewReader(tracker, rardecode.Password(password))
	if err != nil {
		return nil
	}

	for {
		entry, err := archive.Next()
		if err != nil {
			return nil
		}

		if entry.IsDir || !p.isMediaFile(entry.Name) {
			// Skip file content - this advances the tracker position.
			io.Copy(io.Discard, archive)
			continue
		}

		ratio := float64(fileInfo.Size) / float64(entry.UnPackedSize)
		// Written as a negated ">" so a NaN ratio is rejected as well.
		if !(ratio > 0.95) {
			return nil
		}

		offset := tracker.Position()
		p.logger.Info().
			Str("file", entry.Name).
			Int64("accurate_offset", offset).
			Float64("compression_ratio", ratio).
			Msg("Found accurate store RAR offset using position tracking")

		return &ExtractedFileInfo{
			FileName:             entry.Name,
			FileSize:             entry.UnPackedSize,
			SegmentSize:          fileInfo.ChunkSize,
			EstimatedStartOffset: offset,
		}
	}
}
// determineExtension returns the extension (including the dot) of the first
// file in the group whose filename has one, or ".mkv" when none does.
func (p *NZBParser) determineExtension(group *FileGroup) string {
	for i := range group.Files {
		if ext := filepath.Ext(group.Files[i].Filename); ext != "" {
			return ext
		}
	}
	return ".mkv" // Default
}
// getGroupsList returns the names in the groups set as a slice sorted in
// ascending order. The result is never nil.
//
// Sorting matters because callers use the first element as the newsgroup
// for every segment: map iteration order is random, so without it the chosen
// group would differ from one parse of the same NZB to the next.
func (p *NZBParser) getGroupsList(groups map[string]struct{}) []string {
	result := make([]string, 0, len(groups))
	for g := range groups {
		result = append(result, g)
	}
	sort.Strings(result)
	return result
}
// downloadRarHeaders fetches and yEnc-decodes the given segments in order
// and concatenates their decoded bodies, stopping early once more than 32 KiB
// has been collected.
//
// Segments that fail with a retryable error, come back empty, or cannot be
// decoded are skipped; a non-retryable download error aborts immediately.
// An error is also returned when nothing at all could be collected.
func (p *NZBParser) downloadRarHeaders(ctx context.Context, segments []nzbparser.NzbSegment) ([]byte, error) {
	// 32KB should be plenty for RAR headers (typically the first segment is enough).
	const enoughHeaderBytes = 32768

	var collected bytes.Buffer
	for i := range segments {
		id := segments[i].Id

		conn, release, err := p.client.GetConnection(ctx)
		if err != nil {
			continue
		}
		raw, err := conn.GetBody(id)
		release()

		switch {
		case err != nil && !nntp.IsRetryableError(err):
			return nil, err
		case err != nil, len(raw) == 0:
			continue
		}

		decoded, err := nntp.DecodeYenc(bytes.NewReader(raw))
		if err != nil || decoded == nil || len(decoded.Body) == 0 {
			p.logger.Warn().Err(err).Str("segment_id", id).Msg("Failed to decode RAR header segment")
			continue
		}

		collected.Write(decoded.Body)
		if collected.Len() > enoughHeaderBytes {
			break
		}
	}

	if collected.Len() == 0 {
		return nil, fmt.Errorf("no valid header data downloaded")
	}
	return collected.Bytes(), nil
}
// detectFileTypeByContent classifies a file by downloading its first
// segment. The name carried by the decoded segment is tried first via
// detectFileType; if that is inconclusive the decoded bytes are inspected
// for known signatures. The second return value is the name from the
// decoded segment (empty when the file has no segments or the download
// fails, in which case the type is FileTypeUnknown).
func (p *NZBParser) detectFileTypeByContent(ctx context.Context, file nzbparser.NzbFile) (FileType, string) {
	if len(file.Segments) == 0 {
		return FileTypeUnknown, ""
	}

	part, err := p.downloadFirstSegment(ctx, file.Segments[0])
	if err != nil {
		p.logger.Warn().Err(err).Msg("Failed to download first segment for content detection")
		return FileTypeUnknown, ""
	}

	embeddedName := part.Name
	if embeddedName != "" {
		if byName := p.detectFileType(embeddedName); byName != FileTypeUnknown {
			return byName, embeddedName
		}
	}
	return p.detectFileTypeFromContent(part.Body), embeddedName
}
// detectFileTypeFromContent classifies a file from the leading bytes of its
// content by comparing them with known signatures, in this order: RAR 4.x,
// RAR 5.x, ZIP, 7z, Matroska, MP4/MOV ("ftyp" at offset 4), AVI, MPEG
// program stream, MPEG video stream and MPEG transport stream.
//
// It returns FileTypeUnknown for empty input or when nothing matches. Inputs
// of any length are safe: every check verifies that enough bytes are
// present before reading them.
func (p *NZBParser) detectFileTypeFromContent(data []byte) FileType {
	if len(data) == 0 {
		return FileTypeUnknown
	}

	switch {
	// RAR 4.x and RAR 5.x signatures.
	case bytes.HasPrefix(data, []byte("Rar!\x1A\x07\x00")),
		bytes.HasPrefix(data, []byte("Rar!\x1A\x07\x01\x00")):
		return FileTypeRar

	// ZIP local file header.
	case bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}):
		return FileTypeArchive

	// 7z signature.
	case bytes.HasPrefix(data, []byte{0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C}):
		return FileTypeArchive

	// Matroska (MKV/WebM).
	case bytes.HasPrefix(data, []byte{0x1A, 0x45, 0xDF, 0xA3}):
		return FileTypeMedia

	// MP4/MOV: 'ftyp' at offset 4.
	case len(data) >= 8 && bytes.Equal(data[4:8], []byte("ftyp")):
		return FileTypeMedia

	// AVI: RIFF container with the "AVI " form type.
	case len(data) >= 12 && bytes.HasPrefix(data, []byte("RIFF")) &&
		bytes.Equal(data[8:12], []byte("AVI ")):
		return FileTypeMedia

	// MPEG-1/2 program stream and video stream start codes.
	case bytes.HasPrefix(data, []byte{0x00, 0x00, 0x01, 0xBA}),
		bytes.HasPrefix(data, []byte{0x00, 0x00, 0x01, 0xB3}):
		return FileTypeMedia
	}

	// Transport stream: a sync byte at the start of two consecutive
	// packets. Reading data[tsPacketSize] needs at least tsPacketSize+1
	// bytes, so a buffer of exactly one packet cannot be validated.
	const (
		tsPacketSize = 188
		tsSyncByte   = 0x47
	)
	if len(data) > tsPacketSize && data[0] == tsSyncByte && data[tsPacketSize] == tsSyncByte {
		return FileTypeMedia
	}

	return FileTypeUnknown
}
// downloadFirstSegment fetches the body of the given segment over a pooled
// connection and returns it yEnc-decoded. The connection is released when
// the function returns. Connection and download errors are passed through
// unchanged; a decode failure is reported with a generic error.
func (p *NZBParser) downloadFirstSegment(ctx context.Context, segment nzbparser.NzbSegment) (*yenc.Part, error) {
	conn, release, err := p.client.GetConnection(ctx)
	if err != nil {
		return nil, err
	}
	defer release()

	body, err := conn.GetBody(segment.Id)
	if err != nil {
		return nil, err
	}

	// The decoded part carries both the embedded filename and the data.
	decoded, decodeErr := nntp.DecodeYenc(bytes.NewReader(body))
	if decodeErr == nil && decoded != nil {
		return decoded, nil
	}
	return nil, fmt.Errorf("failed to decode segment")
}
// calculateTotalArchiveSize sums the Bytes value of every segment of every
// file in the group.
func (p *NZBParser) calculateTotalArchiveSize(group *FileGroup) int64 {
	sum := int64(0)
	for f := range group.Files {
		segments := group.Files[f].Segments
		for s := range segments {
			sum += int64(segments[s].Bytes)
		}
	}
	return sum
}
// determineNZBName picks the display name for an NZB.
//
// A non-empty filename wins and has its final extension removed. Otherwise
// the "name" metadata entry is used, then the "title" entry; if none is set
// the name is empty. The chosen name is passed through
// utils.RemoveInvalidChars before being returned.
func determineNZBName(filename string, meta map[string]string) string {
	name := filename
	switch {
	case name != "":
		// Strip only the trailing extension. Replacing the first occurrence
		// of the extension text would mangle names such as "a.nzb.b.nzb".
		name = strings.TrimSuffix(name, filepath.Ext(name))
	case meta["name"] != "":
		name = meta["name"]
	default:
		name = meta["title"]
	}
	return utils.RemoveInvalidChars(name)
}
// generateID derives a 16-character hexadecimal identifier from the NZB's
// name, total size (in decimal), category and password: the four values are
// concatenated in that order, hashed with SHA-256, and the first 16 hex
// digits of the digest are returned.
func generateID(nzb *NZB) string {
	seed := nzb.Name + fmt.Sprintf("%d", nzb.TotalSize) + nzb.Category + nzb.Password
	digest := sha256.Sum256([]byte(seed))
	return hex.EncodeToString(digest[:])[:16]
}