package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// sourceGroup represents a set of files sharing the same source directory.
// Batch mode groups by source so the expensive indexing step runs once per
// directory rather than once per file.
type sourceGroup struct {
	sourceDir string // the shared SourceDir value of all files in this group
	indices   []int  // indices into the manifest Files slice
}
// groupBySource groups batch manifest files by their SourceDir.
// Groups are returned in first-seen order, and file indices within each group
// preserve their original manifest order.
func groupBySource(files []dedup.BatchManifestFile) []sourceGroup {
	var result []sourceGroup
	position := make(map[string]int) // sourceDir -> index into result
	for idx, file := range files {
		pos, known := position[file.SourceDir]
		if !known {
			// First time this source directory appears: start a new group.
			pos = len(result)
			position[file.SourceDir] = pos
			result = append(result, sourceGroup{sourceDir: file.SourceDir})
		}
		result[pos].indices = append(result[pos].indices, idx)
	}
	return result
}
// codecMismatchAction controls how reportCodecMismatches handles a mismatch.
type codecMismatchAction int

const (
	codecMismatchPrompt   codecMismatchAction = iota // interactive: prompt user for confirmation
	codecMismatchContinue                            // non-interactive: warn and continue
	codecMismatchSkip                                // skip: warn and signal skip
)
// reportCodecMismatches prints codec mismatch warnings to stderr and then
// reacts according to action: interactively prompt the user, continue without
// prompting (still logging the warning), or signal a skip. A non-nil error is
// returned only in prompt mode, when the user declines to continue.
func reportCodecMismatches(mismatches []source.CodecMismatch, action codecMismatchAction) error {
	if len(mismatches) == 0 {
		return nil
	}
	// Warnings go to stderr so they stay visible even in quiet mode.
	printWarnln()
	printWarnln(" WARNING: Codec mismatch detected")
	for _, mm := range mismatches {
		names := make([]string, 0, len(mm.SourceCodecs))
		for _, codec := range mm.SourceCodecs {
			names = append(names, source.CodecTypeName(codec))
		}
		printWarn(" MKV %s: %s (%s)\n", mm.TrackType, source.CodecTypeName(mm.MKVCodecType), mm.MKVCodecID)
		printWarn(" Source %s: %s\n", mm.TrackType, strings.Join(names, ", "))
	}
	printWarnln()
	printWarnln(" Deduplication may produce poor results if the MKV was transcoded.")
	if action == codecMismatchSkip {
		printWarnln(" Skipping (--skip-codec-mismatch)...")
		printWarnln()
		return nil
	}
	// Prompt mode degrades to continue when stdin is not a terminal.
	if action == codecMismatchContinue || !isTerminal() {
		printWarnln(" Continuing (non-interactive mode)...")
		printWarnln()
		return nil
	}
	fmt.Print("\n Continue anyway? [y/N]: ")
	var answer string
	fmt.Scanln(&answer)
	answer = strings.ToLower(strings.TrimSpace(answer))
	if answer == "y" || answer == "yes" {
		fmt.Println()
		return nil
	}
	return fmt.Errorf("aborted due to codec mismatch")
}
// createBatch processes multiple MKVs from a batch manifest.
// Files are grouped by source directory so each source is indexed once.
// If skipCodecMismatch is true, MKVs with codec mismatches are skipped instead
// of processed. Returns an error when the manifest cannot be read, is empty,
// or when there were non-skipped files and every one of them failed.
func createBatch(manifestPath string, warnThreshold float64, skipCodecMismatch bool) error {
	totalStart := time.Now()
	manifest, err := dedup.ReadBatchManifest(manifestPath)
	if err != nil {
		return err
	}
	// Guard against an empty manifest: the single-source banner below reads
	// groups[0].sourceDir, which would panic with an index-out-of-range when
	// there are no files (len(groups) > 1 is false, so the else branch runs).
	if len(manifest.Files) == 0 {
		return fmt.Errorf("batch manifest contains no files")
	}
	groups := groupBySource(manifest.Files)
	multiSource := len(groups) > 1
	if multiSource {
		printInfo("Batch create: %d %s from %d %s\n\n",
			len(manifest.Files), plural(len(manifest.Files), "file", "files"),
			len(groups), plural(len(groups), "source", "sources"))
	} else {
		printInfo("Batch create: %d %s from %s\n\n",
			len(manifest.Files), plural(len(manifest.Files), "file", "files"), groups[0].sourceDir)
	}
	// results and skipReasons are indexed by manifest file position.
	results := make([]*createResult, len(manifest.Files))
	skipReasons := make([]string, len(manifest.Files))
	var totalIndexDuration time.Duration
	processed := 0
	for gi, g := range groups {
		if multiSource {
			if gi > 0 {
				printInfoln()
			}
			fileWord := "files"
			if len(g.indices) == 1 {
				fileWord = "file"
			}
			printInfo("--- Source %d/%d: %s (%d %s) ---\n", gi+1, len(groups), g.sourceDir, len(g.indices), fileWord)
		}
		// Pre-check: skip files whose output already exists (resuming interrupted batch)
		for _, fi := range g.indices {
			f := manifest.Files[fi]
			if _, err := os.Stat(f.Output); err == nil {
				skipReasons[fi] = "output exists"
			}
		}
		// Pre-check: detect source codecs and warn about incompatible MKVs
		// before the expensive indexing step.
		sourceCodecs, codecErr := source.DetectSourceCodecsFromDir(g.sourceDir)
		if codecErr != nil {
			// Fail open: detection failure only suppresses the pre-check.
			if vw := verboseWriter(); vw != nil {
				fmt.Fprintf(vw, "Note: could not detect source codecs for %s: %v\n", g.sourceDir, codecErr)
			}
			printInfoln()
		} else {
			for _, fi := range g.indices {
				if skipReasons[fi] != "" {
					continue
				}
				f := manifest.Files[fi]
				codecParser, err := mkv.NewParser(f.MKV)
				if err != nil {
					if vw := verboseWriter(); vw != nil {
						fmt.Fprintf(vw, "Note: skipping codec pre-check for %s: %v\n", filepath.Base(f.MKV), err)
					}
					continue
				}
				if err := codecParser.ParseTracksOnly(); err != nil {
					codecParser.Close()
					if vw := verboseWriter(); vw != nil {
						fmt.Fprintf(vw, "Note: skipping codec pre-check for %s: %v\n", filepath.Base(f.MKV), err)
					}
					continue
				}
				mismatches := source.CheckCodecCompatibility(codecParser.Tracks(), sourceCodecs)
				codecParser.Close()
				if skipCodecMismatch && len(mismatches) > 0 {
					// Skip action only prints; its error return is always nil.
					reportCodecMismatches(mismatches, codecMismatchSkip)
					skipReasons[fi] = "codec mismatch"
					continue
				}
				if err := reportCodecMismatches(mismatches, codecMismatchContinue); err != nil {
					return err
				}
			}
			printInfoln()
		}
		// Check if all files in this group are already skipped — skip indexing entirely
		allSkipped := true
		for _, fi := range g.indices {
			if skipReasons[fi] == "" {
				allSkipped = false
				break
			}
		}
		if allSkipped {
			for _, fi := range g.indices {
				processed++
				f := manifest.Files[fi]
				printInfo("\n[%d/%d] %s\n", processed, len(manifest.Files), f.MKV)
				results[fi] = newSkipResult(f.MKV, f.Output, skipReasons[fi])
				printSkipStatus(results[fi])
			}
			continue
		}
		// Index this source directory (the expensive step, done once per group)
		indexLabel := "Indexing source directory..."
		if multiSource {
			indexLabel = fmt.Sprintf("Indexing source %d/%d...", gi+1, len(groups))
		}
		indexStart := time.Now()
		indexer, index, err := buildSourceIndex(g.sourceDir, indexLabel)
		totalIndexDuration += time.Since(indexStart)
		if err != nil {
			printWarn(" ERROR indexing %s: %v\n", g.sourceDir, err)
			// Mark non-skipped files in this group as failed
			for _, fi := range g.indices {
				processed++
				f := manifest.Files[fi]
				printInfo("\n[%d/%d] %s\n", processed, len(manifest.Files), f.MKV)
				if skipReasons[fi] != "" {
					results[fi] = newSkipResult(f.MKV, f.Output, skipReasons[fi])
					printSkipStatus(results[fi])
				} else {
					results[fi] = &createResult{MkvPath: f.MKV, Err: fmt.Errorf("index %s: %w", g.sourceDir, err)}
				}
			}
			if gi < len(groups)-1 {
				printWarnln(" Continuing with remaining sources...")
			}
			continue
		}
		// Process files in this group
		for _, fi := range g.indices {
			processed++
			f := manifest.Files[fi]
			printInfo("\n[%d/%d] %s\n", processed, len(manifest.Files), f.MKV)
			if skipReasons[fi] != "" {
				results[fi] = newSkipResult(f.MKV, f.Output, skipReasons[fi])
				printSkipStatus(results[fi])
				continue
			}
			results[fi] = createDedupWithIndex(f.MKV, f.SourceDir, f.Output, f.Name, indexer, index, 1, 4, true, skipCodecMismatch)
			r := results[fi]
			if r.Skipped {
				printSkipStatus(r)
			} else if r.Err != nil {
				printWarn(" ERROR: %v\n", r.Err)
				if processed < len(manifest.Files) {
					printWarnln(" Continuing with remaining files...")
				}
			} else if r.VerifyErr != nil {
				// Verification error messages are already printed by createDedupWithIndex
				if processed < len(manifest.Files) {
					printWarnln(" Continuing with remaining files...")
				}
			} else {
				printInfo(" MKV: %s bytes | Dedup: %s bytes | Savings: %.1f%% | Time: %v\n",
					formatInt(r.MkvSize), formatInt(r.DedupSize), r.Savings, r.Duration.Round(time.Second))
			}
		}
		index.Close()
	}
	// Print summary
	printBatchSummary(results, totalIndexDuration, totalStart, warnThreshold)
	// Return error only if there were non-skipped files and all of them failed.
	// All-skipped batches (e.g., codec mismatch) are not considered failures.
	// "output exists" (cached) files count as successes — they represent prior
	// successful runs and are shown as "OK [cached]" in the summary.
	hasNonSkipped := false
	anyNonSkippedSucceeded := false
	for _, r := range results {
		isSkipped := r.Skipped && r.SkipReason != "output exists"
		if isSkipped {
			continue
		}
		hasNonSkipped = true
		if r.Err == nil && r.VerifyErr == nil {
			anyNonSkippedSucceeded = true
			break
		}
	}
	if hasNonSkipped && !anyNonSkippedSucceeded {
		return fmt.Errorf("batch create completed with errors")
	}
	return nil
}
// newSkipResult builds a createResult marking mkvPath as skipped for the given
// reason. When the reason is "output exists", it stats the existing MKV and
// dedup files so the summary can still show sizes and savings.
func newSkipResult(mkvPath, outputPath, reason string) *createResult {
	res := &createResult{MkvPath: mkvPath, Skipped: true, SkipReason: reason}
	if reason != "output exists" {
		return res
	}
	res.OutputPath = outputPath
	if info, err := os.Stat(mkvPath); err == nil {
		res.MkvSize = info.Size()
	}
	outInfo, err := os.Stat(outputPath)
	if err != nil {
		return res
	}
	res.DedupSize = outInfo.Size()
	// Savings is only meaningful when the MKV size is known and non-zero.
	if res.MkvSize > 0 {
		res.Savings = float64(res.MkvSize-res.DedupSize) / float64(res.MkvSize) * 100
	}
	return res
}
// printSkipStatus prints the per-file skip message during batch processing.
// "output exists" skips with known file sizes also show dedup stats.
func printSkipStatus(r *createResult) {
	showStats := r.SkipReason == "output exists" && r.MkvSize > 0
	if !showStats {
		printInfo(" Skipping (%s)\n", r.SkipReason)
		return
	}
	printInfo(" Skipping (%s): %s bytes | Dedup: %s bytes | Savings: %.1f%%\n",
		r.SkipReason, formatInt(r.MkvSize), formatInt(r.DedupSize), r.Savings)
}
// printBatchSummary prints the aggregate results of a batch create operation:
// one status line per file, a succeeded/total tally with optional qualifiers,
// and a warning list for files whose savings fell below warnThreshold.
func printBatchSummary(results []*createResult, indexDuration time.Duration, totalStart time.Time, warnThreshold float64) {
	printInfoln()
	printInfoln("=== Batch Results ===")
	printInfo("Total time: %v (indexing: %v)\n\n", time.Since(totalStart), indexDuration)
	succeeded := 0
	cached := 0       // skipped because output already existed (counts as success)
	skipped := 0      // skipped for any other reason (e.g., codec mismatch)
	verifyFailed := 0 // output written but reconstruction verification failed
	var lowSavings []string
	for _, r := range results {
		if r.Skipped && r.SkipReason == "output exists" {
			// Already-processed files: show as OK with stats
			cached++
			if r.OutputPath != "" {
				printInfo(" OK %s -> %s (%.1f%% savings) [cached]\n", r.MkvPath, filepath.Base(r.OutputPath), r.Savings)
			} else {
				printInfo(" OK %s [cached]\n", r.MkvPath)
			}
			// Only flag low savings when file sizes were actually available.
			if r.Savings < warnThreshold && r.MkvSize > 0 {
				lowSavings = append(lowSavings, fmt.Sprintf(" %s: %.1f%% savings", r.MkvPath, r.Savings))
			}
		} else if r.Skipped {
			printInfo(" SKIP %s: %s\n", r.MkvPath, r.SkipReason)
			skipped++
		} else if r.Err != nil {
			printWarn(" FAIL %s: %v\n", r.MkvPath, r.Err)
		} else if r.VerifyErr != nil {
			printWarn(" FAIL %s: verification failed: %v\n", r.MkvPath, r.VerifyErr)
			verifyFailed++
		} else {
			printInfo(" OK %s -> %s (%.1f%% savings)\n", r.MkvPath, filepath.Base(r.OutputPath), r.Savings)
			if r.Savings < warnThreshold {
				lowSavings = append(lowSavings, fmt.Sprintf(" %s: %.1f%% savings", r.MkvPath, r.Savings))
			}
		}
		// A file counts as succeeded when it completed cleanly or was cached
		// from a prior successful run.
		if (!r.Skipped && r.Err == nil && r.VerifyErr == nil) || (r.Skipped && r.SkipReason == "output exists") {
			succeeded++
		}
	}
	// Build summary line with optional qualifiers
	var qualifiers []string
	if cached > 0 {
		qualifiers = append(qualifiers, fmt.Sprintf("%d cached", cached))
	}
	if verifyFailed > 0 {
		qualifiers = append(qualifiers, fmt.Sprintf("%d verification failed", verifyFailed))
	}
	if skipped > 0 {
		qualifiers = append(qualifiers, fmt.Sprintf("%d skipped", skipped))
	}
	if len(qualifiers) > 0 {
		printInfo("\nSucceeded: %d/%d (%s)\n", succeeded, len(results), strings.Join(qualifiers, ", "))
	} else {
		printInfo("\nSucceeded: %d/%d\n", succeeded, len(results))
	}
	// Low-savings warning is suppressed in quiet mode.
	if !quiet && len(lowSavings) > 0 {
		printInfo("\nWARNING: %d %s with space savings below %.0f%%:\n", len(lowSavings), plural(len(lowSavings), "file", "files"), warnThreshold)
		for _, s := range lowSavings {
			printInfoln(s)
		}
		printInfoln(" This may indicate wrong source, transcoded MKV, or very small MKV file.")
	}
}
package main
import (
"fmt"
"log"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// osRemove and osRename are wrappers for testing. Override in tests to
// simulate filesystem errors without touching real files. Production code
// must call these instead of os.Remove/os.Rename in verification cleanup.
var osRemove = os.Remove
var osRename = os.Rename
// parseMKVWithProgress parses an MKV file while driving a progress bar.
// phasePrefix labels the bar (e.g., "Phase 3/6: Parsing MKV file...").
// On success it returns the parser (caller must Close it) along with the
// elapsed parse time.
func parseMKVWithProgress(mkvPath, phasePrefix string) (*mkv.Parser, time.Duration, error) {
	p, err := mkv.NewParser(mkvPath)
	if err != nil {
		return nil, 0, fmt.Errorf("create parser: %w", err)
	}
	bar := newProgressBar(phasePrefix, p.Size(), "bytes")
	began := time.Now()
	parseErr := p.Parse(func(done, _ int64) {
		bar.Update(done)
	})
	if parseErr != nil {
		// Clean up both the bar and the parser on failure.
		bar.Cancel()
		p.Close()
		return nil, 0, fmt.Errorf("parse MKV: %w", parseErr)
	}
	elapsed := time.Since(began)
	bar.Finish()
	return p, elapsed, nil
}
// createResult holds per-file statistics from a create operation.
// Skipped == false with nil Err and VerifyErr indicates full success.
type createResult struct {
	MkvPath        string        // input MKV path
	OutputPath     string        // written .mkvdup path (may become *.failed after a verify failure)
	VirtualName    string        // virtual filename recorded in the config file
	MkvSize        int64         // size of the input MKV in bytes
	DedupSize      int64         // size of the written dedup file in bytes
	MatchedBytes   int64         // bytes matched against the source index
	UnmatchedBytes int64         // bytes stored verbatim as delta data
	MatchedPackets int           // number of packets matched against the source
	TotalPackets   int           // total packets parsed from the MKV
	IndexEntries   int           // number of match-result index entries
	Savings        float64       // space savings as a percentage of MkvSize
	Duration       time.Duration // wall-clock processing time for this file
	Err            error         // fatal processing error, if any
	VerifyErr      error         // non-nil if post-create verification failed
	Skipped        bool          // true when file was skipped (e.g., codec mismatch, output exists)
	SkipReason     string        // reason for skipping (shown in summary)
}
// buildSourceIndex indexes a source directory and returns the indexer plus the
// resulting index. This is the expensive step that should only happen once in
// batch mode. phasePrefix labels the progress bar
// (e.g., "Phase 2/6: Building source index...").
func buildSourceIndex(sourceDir, phasePrefix string) (*source.Indexer, *source.Index, error) {
	idxr, err := source.NewIndexer(sourceDir, source.DefaultWindowSize)
	if err != nil {
		return nil, nil, fmt.Errorf("create indexer: %w", err)
	}
	idxr.SetVerboseWriter(verboseWriter())
	// The total size is unknown until Build reports it through the callback,
	// so start the bar at 0 and adopt the first non-zero total seen.
	bar := newProgressBar(phasePrefix, 0, "bytes")
	buildErr := idxr.Build(func(done, total int64) {
		if total > 0 && bar.total == 0 {
			bar.total = total
		}
		bar.Update(done)
	})
	if buildErr != nil {
		bar.Cancel()
		return nil, nil, fmt.Errorf("build index: %w", buildErr)
	}
	bar.Finish()
	idx := idxr.Index()
	printInfo(" Indexed %d hashes\n", len(idx.HashToLocations))
	if idx.UsesESOffsets {
		printInfo(" (Using ES-aware indexing for %v)\n", idxr.SourceType())
	}
	return idxr, idx, nil
}
// checkCodecCompatibilityFromDir performs a lightweight codec check using only
// the source directory (no index needed), so it can run before the expensive
// indexing step. Detection failures are logged (verbose only) and treated as
// "no mismatch".
func checkCodecCompatibilityFromDir(tracks []mkv.Track, sourceDir string, nonInteractive bool) error {
	codecs, err := source.DetectSourceCodecsFromDir(sourceDir)
	if err != nil {
		// Fail open: inability to detect codecs should not block creation.
		if vw := verboseWriter(); vw != nil {
			fmt.Fprintf(vw, " Note: could not detect source codecs: %v\n", err)
		}
		return nil
	}
	action := codecMismatchContinue
	if !nonInteractive {
		action = codecMismatchPrompt
	}
	return reportCodecMismatches(source.CheckCodecCompatibility(tracks, codecs), action)
}
// createDedupWithIndex processes a single MKV using a pre-built source index.
// It handles parsing, matching, writing, and verification.
// phaseStart and phaseTotal control phase numbering (e.g., 3,6 for single create; 1,4 for batch).
// If nonInteractive is true, codec mismatch warnings do not prompt the user.
// If skipCodecMismatch is true, the result is marked as Skipped on codec mismatch instead of continuing.
// All failures are reported through the returned createResult (Err/VerifyErr).
func createDedupWithIndex(mkvPath, sourceDir, outputPath, virtualName string,
	indexer *source.Indexer, index *source.Index, phaseStart, phaseTotal int, nonInteractive, skipCodecMismatch bool) *createResult {
	start := time.Now()
	result := &createResult{
		MkvPath:     mkvPath,
		OutputPath:  outputPath,
		VirtualName: virtualName,
	}
	// phaseLabel formats "Phase N/M: ..." labels relative to phaseStart.
	phaseLabel := func(offset int, label string) string {
		return fmt.Sprintf("Phase %d/%d: %s", phaseStart+offset, phaseTotal, label)
	}
	// Parse MKV
	parser, _, err := parseMKVWithProgress(mkvPath, phaseLabel(0, "Parsing MKV file..."))
	if err != nil {
		result.Err = err
		return result
	}
	defer parser.Close()
	// Fallback codec check using the index (in case the pre-indexing directory-based
	// check was skipped, e.g. detection failure or batch mode with undetectable codecs)
	sourceCodecs, codecErr := source.DetectSourceCodecs(index)
	if codecErr == nil {
		mismatches := source.CheckCodecCompatibility(parser.Tracks(), sourceCodecs)
		if skipCodecMismatch && len(mismatches) > 0 {
			// Skip action only prints; its error return is always nil.
			reportCodecMismatches(mismatches, codecMismatchSkip)
			result.Skipped = true
			result.SkipReason = "codec mismatch"
			return result
		}
		action := codecMismatchPrompt
		if nonInteractive {
			action = codecMismatchContinue
		}
		if err := reportCodecMismatches(mismatches, action); err != nil {
			result.Err = err
			return result
		}
	}
	// Calculate MKV checksum (stored in the dedup header for later validation)
	printInfo(" Calculating MKV checksum...")
	mkvChecksum, err := calculateFileChecksum(mkvPath)
	if err != nil {
		result.Err = fmt.Errorf("calculate MKV checksum: %w", err)
		return result
	}
	printInfo(" done\n")
	// Match packets
	m, err := matcher.NewMatcher(index)
	if err != nil {
		result.Err = fmt.Errorf("create matcher: %w", err)
		return result
	}
	defer m.Close()
	m.SetVerboseWriter(verboseWriter())
	matchBar := newProgressBar(phaseLabel(1, "Matching packets..."), int64(len(parser.Packets())), "packets")
	matchResult, err := m.Match(mkvPath, parser.Packets(), parser.Tracks(), func(processed, total int) {
		matchBar.Update(int64(processed))
	})
	if err != nil {
		matchBar.Cancel()
		result.Err = fmt.Errorf("match: %w", err)
		return result
	}
	defer matchResult.Close()
	matchBar.Finish()
	// Write dedup file
	writer, err := dedup.NewWriter(outputPath)
	if err != nil {
		result.Err = fmt.Errorf("create dedup writer: %w", err)
		return result
	}
	defer writer.Close()
	writer.SetHeader(parser.Size(), mkvChecksum, indexer.SourceType())
	writer.SetCreatorVersion("mkvdup " + version)
	writer.SetSourceFiles(index.Files)
	// For sources with ES offsets, decide between V3 (convert to raw) and V4 (range maps).
	// V4 stores ES offsets with embedded range maps for ES-to-raw translation at read time.
	// V3 converts ES offsets to raw file offsets at write time (simpler, smaller files).
	// V4 is used for Blu-ray (TS packet structure makes V3 impractical) and for DVDs
	// with LPCM audio (byte-swap pairs can straddle PES boundaries, requiring contiguous
	// ES reads that only range maps provide). Non-LPCM DVDs use V3 for fastest reads.
	var esConverters []source.ESRangeConverter
	if index.UsesESOffsets && len(index.ESReaders) > 0 {
		// Check if any matched entry uses LPCM (requires range maps for correct byte-swap).
		hasLPCM := false
		for _, e := range matchResult.Entries {
			if e.IsLPCM {
				hasLPCM = true
				break
			}
		}
		useRangeMaps := indexer.SourceType() == source.TypeBluray || hasLPCM
		if useRangeMaps {
			// V4: use range maps (preserves ES offsets in entries)
			// Only include range maps for streams actually referenced by matched entries.
			type streamKey struct {
				fileIndex        uint16
				isVideo          bool
				audioSubStreamID byte
			}
			usedStreams := make(map[streamKey]bool)
			for _, e := range matchResult.Entries {
				// Source 0 means "no source file"; entry indices are 1-based.
				if e.Source == 0 {
					continue
				}
				usedStreams[streamKey{e.Source - 1, e.IsVideo, e.AudioSubStreamID}] = true
			}
			// Collect the set of file indices that need range maps
			usedFiles := make(map[uint16]bool)
			for k := range usedStreams {
				usedFiles[k.fileIndex] = true
			}
			// Sort file indices for deterministic output.
			sortedFiles := make([]uint16, 0, len(usedFiles))
			for fi := range usedFiles {
				sortedFiles = append(sortedFiles, fi)
			}
			sort.Slice(sortedFiles, func(i, j int) bool { return sortedFiles[i] < sortedFiles[j] })
			var rangeMaps []dedup.RangeMapData
			for _, fi := range sortedFiles {
				i := int(fi)
				if i >= len(index.ESReaders) {
					continue
				}
				reader := index.ESReaders[i]
				provider, ok := reader.(source.PESRangeProvider)
				if !ok {
					continue
				}
				rm := dedup.RangeMapData{
					FileIndex: fi,
				}
				// Only include video range map if video entries reference this file
				if usedStreams[streamKey{fi, true, 0}] {
					rm.VideoRanges = provider.FilteredVideoRanges()
				}
				// If this reader provides offset conversion (e.g., ISO adapter),
				// set the converter for range map encoding.
				if adj, ok := reader.(source.FileOffsetAdjuster); ok {
					rm.OffsetFunc = adj.FileOffsetConverter()
				}
				// Only include audio sub-stream range maps that are actually used
				for _, subID := range provider.AudioSubStreams() {
					if usedStreams[streamKey{fi, false, subID}] {
						rm.AudioStreams = append(rm.AudioStreams, dedup.AudioRangeData{
							SubStreamID: subID,
							Ranges:      provider.FilteredAudioRanges(subID),
						})
					}
				}
				rangeMaps = append(rangeMaps, rm)
			}
			if vw := verboseWriter(); vw != nil {
				fmt.Fprintf(vw, " Range maps: %d/%d source files used, %d streams referenced\n",
					len(usedFiles), len(index.ESReaders), len(usedStreams))
			}
			if len(rangeMaps) > 0 {
				writer.SetRangeMaps(rangeMaps)
			}
		} else {
			// V3: convert ES offsets to raw offsets for DVDs
			esConverters = make([]source.ESRangeConverter, len(index.ESReaders))
			for i, r := range index.ESReaders {
				if converter, ok := r.(source.ESRangeConverter); ok {
					esConverters[i] = converter
				}
			}
		}
	}
	if err := writer.SetMatchResult(matchResult, esConverters); err != nil {
		// Remove the partially-written output on failure.
		os.Remove(outputPath)
		result.Err = fmt.Errorf("set match result: %w", err)
		return result
	}
	// Pre-encode range maps (CPU-intensive) before the progress-tracked write.
	rangeMapSize, err := writer.EncodeRangeMaps()
	if err != nil {
		os.Remove(outputPath)
		result.Err = fmt.Errorf("encode range maps: %w", err)
		return result
	}
	if rangeMapSize > 0 {
		printInfo(" Range maps encoded: %s bytes\n", formatInt(rangeMapSize))
	}
	writeBar := newProgressBar(phaseLabel(2, "Writing dedup file..."), 0, "bytes")
	if err := writer.WriteWithProgress(func(written, total int64) {
		// Total is unknown until the first callback; adopt it lazily.
		if writeBar.total == 0 && total > 0 {
			writeBar.total = total
		}
		writeBar.Update(written)
	}); err != nil {
		writeBar.Cancel()
		os.Remove(outputPath)
		result.Err = fmt.Errorf("write dedup file: %w", err)
		return result
	}
	writeBar.Finish()
	// Write config file (failure is non-fatal; only a warning is printed)
	configPath := outputPath + ".yaml"
	if err := dedup.WriteConfig(configPath, virtualName, outputPath, sourceDir); err != nil {
		printInfo(" Warning: failed to write config file: %v\n", err)
	} else {
		printInfo(" Config: %s\n", configPath)
	}
	// Verify reconstruction (may rename the output to *.failed on failure)
	verifyPrefix := phaseLabel(3, "Verifying reconstruction...")
	outputPath = handleVerifyResult(outputPath, sourceDir, mkvPath, index, verifyPrefix, result)
	// Populate result
	result.MkvSize = parser.Size()
	result.MatchedBytes = matchResult.MatchedBytes
	result.UnmatchedBytes = matchResult.UnmatchedBytes
	result.MatchedPackets = matchResult.MatchedPackets
	result.TotalPackets = matchResult.TotalPackets
	result.IndexEntries = len(matchResult.Entries)
	// Stat error is deliberately ignored: size/savings stay zero if the
	// output was renamed or removed.
	dedupInfo, _ := os.Stat(outputPath)
	if dedupInfo != nil {
		result.DedupSize = dedupInfo.Size()
		result.Savings = float64(result.MkvSize-result.DedupSize) / float64(result.MkvSize) * 100
	}
	result.Duration = time.Since(start)
	return result
}
// handleVerifyResult runs post-write verification and handles failures.
// On failure it removes the now-stale config file, renames the broken .mkvdup
// to .mkvdup.failed (replacing any previous .failed file), and records the
// error in result.VerifyErr. Returns the (possibly renamed) outputPath.
func handleVerifyResult(outputPath, sourceDir, mkvPath string, index *source.Index, phasePrefix string, result *createResult) string {
	err := verifyReconstructionFunc(outputPath, sourceDir, mkvPath, index, phasePrefix)
	if err == nil {
		return outputPath
	}
	printWarn(" ERROR: Verification failed: %v\n", err)
	// The config file references the pre-rename path; drop it.
	configPath := outputPath + ".yaml"
	switch rmErr := osRemove(configPath); {
	case rmErr == nil:
		printWarn(" Removed config file: %s\n", configPath)
	case !os.IsNotExist(rmErr):
		printWarn(" ERROR: Failed to remove config file %s: %v\n", configPath, rmErr)
	}
	// Move the broken file aside, replacing any earlier .failed artifact.
	failedPath := outputPath + ".failed"
	if rmErr := osRemove(failedPath); rmErr != nil && !os.IsNotExist(rmErr) {
		printWarn(" ERROR: Failed to remove existing failed file %s: %v\n", failedPath, rmErr)
	}
	if renameErr := osRename(outputPath, failedPath); renameErr != nil {
		printWarn(" ERROR: Failed to rename broken file: %v\n", renameErr)
	} else {
		printWarn(" Renamed to: %s\n", failedPath)
		outputPath = failedPath
		result.OutputPath = failedPath
	}
	result.VerifyErr = err
	return outputPath
}
// createDedup creates a .mkvdup file from an MKV and source directory.
// It runs the full six-phase pipeline (codec check, indexing, then the four
// phases of createDedupWithIndex) and prints a results summary.
// Returns a non-nil error on fatal failure or verification failure.
func createDedup(mkvPath, sourceDir, outputPath, virtualName string, warnThreshold float64, nonInteractive bool) error {
	totalStart := time.Now()
	// Default virtual name
	if virtualName == "" {
		virtualName = filepath.Base(mkvPath)
	}
	// Ensure virtual name has .mkv extension
	if !strings.HasSuffix(strings.ToLower(virtualName), ".mkv") {
		virtualName += ".mkv"
	}
	printInfoln("Creating dedup file...")
	printInfo(" MKV: %s\n", mkvPath)
	printInfo(" Source: %s\n", sourceDir)
	printInfo(" Output: %s\n", outputPath)
	printInfoln()
	// Phase 1: Quick codec compatibility check (only reads MKV track headers, not full file)
	printInfo("Phase 1/6: Checking codec compatibility...")
	codecParser, err := mkv.NewParser(mkvPath)
	if err != nil {
		return fmt.Errorf("open MKV: %w", err)
	}
	if err := codecParser.ParseTracksOnly(); err != nil {
		// Fail open: this fast-path parser can't handle all MKV layouts.
		// Log and continue without the pre-index codec compatibility check.
		log.Printf("Warning: fast MKV track parsing failed for %q: %v; continuing without pre-index codec check", mkvPath, err)
		codecParser.Close()
	} else {
		if err := checkCodecCompatibilityFromDir(codecParser.Tracks(), sourceDir, nonInteractive); err != nil {
			codecParser.Close()
			return err
		}
		codecParser.Close()
	}
	printInfoln(" done")
	// Phase 2: Index source (expensive)
	indexer, index, err := buildSourceIndex(sourceDir, "Phase 2/6: Building source index...")
	if err != nil {
		return err
	}
	defer index.Close()
	// Phase 3-6: Process MKV (re-parses MKV, but parsing is fast relative to indexing)
	result := createDedupWithIndex(mkvPath, sourceDir, outputPath, virtualName, indexer, index, 3, 6, nonInteractive, false)
	if result.Err != nil {
		return result.Err
	}
	// Summary
	printInfoln()
	printInfoln("=== Results ===")
	printInfo("Total time: %v\n", time.Since(totalStart))
	printInfoln()
	printInfo("MKV file size: %s bytes (%.2f MB)\n", formatInt(result.MkvSize), float64(result.MkvSize)/(1024*1024))
	printInfo("Matched bytes: %s bytes (%.2f MB, %.1f%%)\n",
		formatInt(result.MatchedBytes), float64(result.MatchedBytes)/(1024*1024),
		float64(result.MatchedBytes)/float64(result.MkvSize)*100)
	printInfo("Delta (unmatched): %s bytes (%.2f MB, %.1f%%)\n",
		formatInt(result.UnmatchedBytes), float64(result.UnmatchedBytes)/(1024*1024),
		float64(result.UnmatchedBytes)/float64(result.MkvSize)*100)
	printInfoln()
	printInfo("Dedup file size: %s bytes (%.2f MB)\n", formatInt(result.DedupSize), float64(result.DedupSize)/(1024*1024))
	printInfo("Space savings: %.1f%%\n", result.Savings)
	printInfoln()
	printInfo("Packets matched: %s / %s (%.1f%%)\n",
		formatInt(int64(result.MatchedPackets)), formatInt(int64(result.TotalPackets)),
		float64(result.MatchedPackets)/float64(result.TotalPackets)*100)
	printInfo("Index entries: %s\n", formatInt(int64(result.IndexEntries)))
	// Warning for low savings (suppressed in quiet mode)
	if !quiet && result.Savings < warnThreshold {
		printInfoln()
		printInfo("WARNING: Space savings (%.1f%%) below %.0f%%\n", result.Savings, warnThreshold)
		printInfoln(" This may indicate wrong source, transcoded MKV, or very small MKV file.")
	}
	// Verification failure surfaces as the command's error after the summary.
	if result.VerifyErr != nil {
		return fmt.Errorf("verification failed: %w", result.VerifyErr)
	}
	return nil
}
package main
import (
"fmt"
"time"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// parseMKV parses a single MKV file and prints a diagnostic report to stdout:
// parse time, track list (with NAL length format for video tracks), packet
// counts, and the first few packets. Used by the standalone parse command.
func parseMKV(path string) error {
	fmt.Printf("Parsing MKV file: %s\n", path)
	parser, err := mkv.NewParser(path)
	if err != nil {
		return fmt.Errorf("create parser: %w", err)
	}
	defer parser.Close()
	fmt.Printf("File size: %s bytes (%.2f GB)\n", formatInt(parser.Size()), float64(parser.Size())/(1024*1024*1024))
	start := time.Now()
	lastProgress := time.Now()
	err = parser.Parse(func(processed, total int64) {
		// Throttle progress output to twice per second.
		if time.Since(lastProgress) > 500*time.Millisecond {
			pct := float64(processed) / float64(total) * 100
			fmt.Printf("\rProgress: %.1f%% (%s / %s bytes)", pct, formatInt(processed), formatInt(total))
			lastProgress = time.Now()
		}
	})
	if err != nil {
		return fmt.Errorf("parse: %w", err)
	}
	elapsed := time.Since(start)
	fmt.Printf("\rProgress: 100.0%% - Complete          \n")
	fmt.Printf("Parse time: %v\n", elapsed)
	fmt.Println()
	fmt.Printf("Tracks: %d\n", len(parser.Tracks()))
	for _, t := range parser.Tracks() {
		typeStr := "unknown"
		switch t.Type {
		case mkv.TrackTypeVideo:
			typeStr = "video"
		case mkv.TrackTypeAudio:
			typeStr = "audio"
		case mkv.TrackTypeSubtitle:
			typeStr = "subtitle"
		}
		extra := ""
		if t.Type == mkv.TrackTypeVideo {
			// Report whether the video stream uses length-prefixed NAL units
			// (AVCC/HVCC) or Annex B start codes.
			nalSize := matcher.NALLengthSizeForTrack(t.CodecID, t.CodecPrivate)
			if nalSize > 0 {
				extra = fmt.Sprintf(", NAL length: %d bytes (AVCC/HVCC)", nalSize)
			} else {
				extra = ", Annex B"
			}
		}
		fmt.Printf(" Track %d: %s (codec: %s%s)\n", t.Number, typeStr, t.CodecID, extra)
	}
	fmt.Println()
	fmt.Printf("Total packets: %d\n", parser.PacketCount())
	fmt.Printf(" Video packets: %d\n", parser.VideoPacketCount())
	fmt.Printf(" Audio packets: %d\n", parser.AudioPacketCount())
	// Show some sample packets
	packets := parser.Packets()
	if len(packets) > 0 {
		fmt.Println()
		fmt.Println("Sample packets (first 5):")
		for i := 0; i < 5 && i < len(packets); i++ {
			p := packets[i]
			fmt.Printf(" Packet %d: offset=%d, size=%d, track=%d, keyframe=%v\n",
				i, p.Offset, p.Size, p.TrackNum, p.Keyframe)
		}
	}
	return nil
}
// indexSource indexes a source directory and prints a diagnostic report to
// stdout: source type, indexed files, unique hash count, index type, and
// total indexed locations. Used by the standalone index command.
func indexSource(dir string) error {
	fmt.Printf("Indexing source directory: %s\n", dir)
	indexer, err := source.NewIndexer(dir, source.DefaultWindowSize)
	if err != nil {
		return fmt.Errorf("create indexer: %w", err)
	}
	fmt.Printf("Source type: %s\n", indexer.SourceType())
	start := time.Now()
	lastProgress := time.Now()
	err = indexer.Build(func(processed, total int64) {
		// Throttle progress output to twice per second.
		if time.Since(lastProgress) > 500*time.Millisecond {
			pct := float64(processed) / float64(total) * 100
			fmt.Printf("\rProgress: %.1f%% (%s / %s bytes)", pct, formatInt(processed), formatInt(total))
			lastProgress = time.Now()
		}
	})
	if err != nil {
		return fmt.Errorf("build index: %w", err)
	}
	elapsed := time.Since(start)
	fmt.Printf("\rProgress: 100.0%% - Complete          \n")
	fmt.Printf("Index time: %v\n", elapsed)
	fmt.Println()
	index := indexer.Index()
	defer index.Close()
	fmt.Printf("Source files: %d\n", len(index.Files))
	for _, f := range index.Files {
		fmt.Printf(" %s: %s bytes\n", f.RelativePath, formatInt(f.Size))
	}
	fmt.Println()
	fmt.Printf("Unique hashes: %d\n", len(index.HashToLocations))
	if index.UsesESOffsets {
		// ES-aware indexes exist for MPEG-TS (Blu-ray) and MPEG-PS (DVD).
		containerType := "MPEG-PS"
		if indexer.SourceType() == source.TypeBluray {
			containerType = "MPEG-TS"
		}
		fmt.Printf("Index type: ES-aware (%s)\n", containerType)
	}
	// Count total locations
	totalLocations := 0
	for _, locs := range index.HashToLocations {
		totalLocations += len(locs)
	}
	fmt.Printf("Total indexed locations: %d\n", totalLocations)
	return nil
}
// matchMKV parses an MKV file, indexes a source directory, matches the MKV's
// packets against the source index, and prints a summary of matched/unmatched
// bytes plus an estimate of the resulting .mkvdup file size.
//
// Fix: previously, for ES-aware (Blu-ray) sources the total estimate was
// computed using the V3 footer size while the printed "Footer:" line was
// switched to the V4 footer size afterwards, so the printed breakdown did not
// sum to the printed total. The footer size is now chosen before the total is
// computed.
func matchMKV(mkvPath, sourceDir string) error {
	totalStart := time.Now()
	// Phase 1: Parse MKV
	parser, _, err := parseMKVWithProgress(mkvPath, "Phase 1/3: Parsing MKV file...")
	if err != nil {
		return err
	}
	defer parser.Close()
	// Phase 2: Index source
	_, index, err := buildSourceIndex(sourceDir, "Phase 2/3: Indexing source...")
	if err != nil {
		return err
	}
	defer index.Close()
	// Phase 3: Match packets
	fmt.Println("Phase 3/3: Matching packets...")
	m, err := matcher.NewMatcher(index)
	if err != nil {
		return fmt.Errorf("create matcher: %w", err)
	}
	defer m.Close()
	start := time.Now()
	lastProgress := time.Now()
	result, err := m.Match(mkvPath, parser.Packets(), parser.Tracks(), func(processed, total int) {
		// Throttle progress output to at most ~2 updates per second.
		if time.Since(lastProgress) > 500*time.Millisecond {
			pct := float64(processed) / float64(total) * 100
			fmt.Printf("\r Progress: %.1f%% (%d/%d packets)", pct, processed, total)
			lastProgress = time.Now()
		}
	})
	if err != nil {
		return fmt.Errorf("match: %w", err)
	}
	fmt.Printf("\r Matched in %v \n", time.Since(start))
	// Summary
	fmt.Println()
	fmt.Println("=== Results ===")
	fmt.Printf("Total time: %v\n", time.Since(totalStart))
	fmt.Println()
	mkvSize := parser.Size()
	fmt.Printf("MKV file size: %s bytes (%.2f MB)\n", formatInt(mkvSize), float64(mkvSize)/(1024*1024))
	fmt.Printf("Matched bytes: %s bytes (%.2f MB, %.1f%%)\n",
		formatInt(result.MatchedBytes), float64(result.MatchedBytes)/(1024*1024),
		float64(result.MatchedBytes)/float64(mkvSize)*100)
	fmt.Printf("Delta (unmatched): %s bytes (%.2f MB, %.1f%%)\n",
		formatInt(result.UnmatchedBytes), float64(result.UnmatchedBytes)/(1024*1024),
		float64(result.UnmatchedBytes)/float64(mkvSize)*100)
	fmt.Println()
	fmt.Printf("Packets matched: %d / %d (%.1f%%)\n",
		result.MatchedPackets, result.TotalPackets,
		float64(result.MatchedPackets)/float64(result.TotalPackets)*100)
	fmt.Printf("Index entries: %d\n", len(result.Entries))
	fmt.Println()
	// Storage savings (using actual format constants)
	indexSize := int64(len(result.Entries) * dedup.EntrySize)
	headerSize := int64(dedup.HeaderSize)
	// Choose the footer size up front so the printed breakdown sums to the
	// printed total (ES-aware sources use the larger V4 footer).
	footerSize := int64(dedup.FooterSize)
	if index.UsesESOffsets {
		footerSize = int64(dedup.FooterV4Size)
	}
	totalDedupSize := headerSize + indexSize + int64(len(result.DeltaData)) + footerSize
	// For Blu-ray sources, V4 format includes range map section (estimate)
	rangeMapNote := ""
	if index.UsesESOffsets {
		// Range map is compressed; rough estimate is ~5-10% of index size
		rangeMapEstimate := indexSize / 10
		totalDedupSize += rangeMapEstimate
		rangeMapNote = fmt.Sprintf(" + ~%s range map", formatInt(rangeMapEstimate))
	}
	savings := float64(mkvSize-totalDedupSize) / float64(mkvSize) * 100
	fmt.Printf("Estimated dedup file size:\n")
	fmt.Printf(" Header: %s bytes\n", formatInt(headerSize))
	fmt.Printf(" Index: %s bytes (%s entries × %d)\n", formatInt(indexSize), formatInt(int64(len(result.Entries))), dedup.EntrySize)
	fmt.Printf(" Delta: %s bytes\n", formatInt(int64(len(result.DeltaData))))
	fmt.Printf(" Footer: %s bytes\n", formatInt(footerSize))
	fmt.Printf(" Total: ~%s bytes (%.2f MB)%s\n", formatInt(totalDedupSize), float64(totalDedupSize)/(1024*1024), rangeMapNote)
	fmt.Printf(" Savings: ~%.1f%% reduction\n", savings)
	return nil
}
package main
import (
"encoding/binary"
"fmt"
"sort"
"strings"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/mmap"
)
// deltaClass accumulates byte count and entry count for delta classification.
// It is used as a simple tally bucket by deltadiag: one instance per stream
// class (video/audio/container), per audio codec, and per H.264 NAL type.
type deltaClass struct {
	bytes int64 // total delta bytes attributed to this class
	count int   // number of delta regions (or NALs) counted
}
// deltadiag analyzes delta (unmatched) entries in a .mkvdup file by
// cross-referencing with the original MKV to classify what stream type
// each delta region belongs to (video/audio/container).
//
// Output goes to stderr (progress via printWarn) and stdout (the report).
// Returns an error if either file cannot be opened or parsed.
func deltadiag(dedupPath, mkvPath string) error {
	// Open dedup file
	reader, err := dedup.NewReader(dedupPath, "")
	if err != nil {
		return fmt.Errorf("open dedup file: %w", err)
	}
	defer reader.Close()
	entryCount := reader.EntryCount()
	origSize := reader.OriginalSize()
	printWarn("Dedup file: %d %s, original size %s bytes (%.2f MB)\n",
		entryCount, plural(entryCount, "entry", "entries"), formatInt(origSize), float64(origSize)/(1024*1024))
	// Parse MKV to get packet boundaries
	printWarn("Parsing MKV file...\n")
	mkvParser, err := mkv.NewParser(mkvPath)
	if err != nil {
		return fmt.Errorf("create MKV parser: %w", err)
	}
	defer mkvParser.Close()
	if err := mkvParser.Parse(nil); err != nil {
		return fmt.Errorf("parse MKV: %w", err)
	}
	packets := mkvParser.Packets()
	tracks := mkvParser.Tracks()
	printWarn(" %d packets, %d tracks\n", len(packets), len(tracks))
	// Build track type map and detect AVCC NAL length size
	trackTypes := make(map[int]int)    // track number -> track type
	trackCodecs := make(map[int]string) // track number -> codec ID
	nalLenSizes := make(map[int]int)   // track number -> AVCC NAL length prefix size
	isAVCTrack := make(map[int]bool)   // track number -> true for H.264/AVC tracks
	for _, t := range tracks {
		trackTypes[int(t.Number)] = t.Type
		trackCodecs[int(t.Number)] = t.CodecID
		nalLenSizes[int(t.Number)] = matcher.NALLengthSizeForTrack(t.CodecID, t.CodecPrivate)
		if strings.HasPrefix(t.CodecID, "V_MPEG4/ISO/AVC") {
			isAVCTrack[int(t.Number)] = true
		}
	}
	// Memory-map MKV for reading delta bytes
	mkvMmap, err := mmap.Open(mkvPath)
	if err != nil {
		return fmt.Errorf("mmap MKV: %w", err)
	}
	defer mkvMmap.Close()
	mkvData := mkvMmap.Data()
	// Sort packets by offset for binary search
	sort.Slice(packets, func(i, j int) bool {
		return packets[i].Offset < packets[j].Offset
	})
	// Classify each delta entry
	printWarn("Classifying delta entries...\n")
	var deltaVideo, deltaAudio, deltaContainer deltaClass
	deltaAudioByCodec := make(map[string]*deltaClass)
	// NAL type is a 5-bit field (0-31), hence the fixed 32-slot array.
	var deltaVideoByNAL [32]deltaClass
	var deltaVideoSliceSmall, deltaVideoSliceLarge deltaClass
	for i := 0; i < entryCount; i++ {
		ent, ok := reader.GetEntry(i)
		if !ok {
			continue
		}
		if ent.Source != 0 {
			continue // Skip matched entries
		}
		entStart := ent.MkvOffset
		entEnd := entStart + ent.Length
		// Walk through the delta entry's byte range, classifying each portion
		// based on which MKV packet (if any) it overlaps. A single delta entry
		// can span multiple packets and container gaps when large unmatched
		// regions (e.g., LPCM audio) create contiguous delta runs.
		pos := entStart
		for pos < entEnd {
			pktIdx := deltadiagFindPacket(packets, pos)
			if pktIdx < 0 {
				// Not inside any packet — find the next packet start
				nextPkt := deltadiagFindNextPacket(packets, pos)
				var gapEnd int64
				if nextPkt >= 0 && packets[nextPkt].Offset < entEnd {
					gapEnd = packets[nextPkt].Offset
				} else {
					gapEnd = entEnd
				}
				// Bytes between packets are container/muxing overhead.
				gapBytes := gapEnd - pos
				deltaContainer.bytes += gapBytes
				deltaContainer.count++
				pos = gapEnd
				continue
			}
			pkt := packets[pktIdx]
			pktEnd := pkt.Offset + pkt.Size
			overlapEnd := entEnd
			if overlapEnd > pktEnd {
				overlapEnd = pktEnd
			}
			overlapBytes := overlapEnd - pos
			ttype := trackTypes[int(pkt.TrackNum)]
			if ttype == mkv.TrackTypeVideo {
				deltaVideo.bytes += overlapBytes
				deltaVideo.count++
				// Parse AVCC NALs in the delta region
				nalLenSize := nalLenSizes[int(pkt.TrackNum)]
				if nalLenSize > 0 && isAVCTrack[int(pkt.TrackNum)] && overlapBytes >= int64(nalLenSize+1) {
					deltaStart := pos
					deltaEnd := overlapEnd
					if deltaEnd <= int64(len(mkvData)) {
						deltadiagClassifyAVCC(mkvData, pkt, nalLenSize, deltaStart, deltaEnd,
							&deltaVideoByNAL, &deltaVideoSliceSmall, &deltaVideoSliceLarge)
					}
				}
			} else if ttype == mkv.TrackTypeAudio {
				deltaAudio.bytes += overlapBytes
				deltaAudio.count++
				codec := trackCodecs[int(pkt.TrackNum)]
				if codec == "" {
					codec = "unknown"
				}
				// Lazily create the per-codec tally bucket.
				dc := deltaAudioByCodec[codec]
				if dc == nil {
					dc = &deltaClass{}
					deltaAudioByCodec[codec] = dc
				}
				dc.bytes += overlapBytes
				dc.count++
			} else {
				deltaContainer.bytes += overlapBytes
				deltaContainer.count++
			}
			pos = overlapEnd
		}
	}
	// Print results
	totalDelta := deltaVideo.bytes + deltaAudio.bytes + deltaContainer.bytes
	if totalDelta == 0 {
		fmt.Printf("\nNo delta entries found (100%% matched).\n")
		return nil
	}
	fmt.Printf("\n=== Delta Classification ===\n")
	fmt.Printf("Total delta: %s bytes (%.2f MB)\n\n", formatInt(totalDelta), float64(totalDelta)/(1024*1024))
	fmt.Printf("Video delta: %12s bytes (%8.2f MB) [%6d entries] (%.1f%% of delta)\n",
		formatInt(deltaVideo.bytes), float64(deltaVideo.bytes)/(1024*1024), deltaVideo.count,
		float64(deltaVideo.bytes)/float64(totalDelta)*100)
	fmt.Printf("Audio delta: %12s bytes (%8.2f MB) [%6d entries] (%.1f%% of delta)\n",
		formatInt(deltaAudio.bytes), float64(deltaAudio.bytes)/(1024*1024), deltaAudio.count,
		float64(deltaAudio.bytes)/float64(totalDelta)*100)
	fmt.Printf("Container delta: %12s bytes (%8.2f MB) [%6d entries] (%.1f%% of delta)\n",
		formatInt(deltaContainer.bytes), float64(deltaContainer.bytes)/(1024*1024), deltaContainer.count,
		float64(deltaContainer.bytes)/float64(totalDelta)*100)
	// Audio codec breakdown
	if len(deltaAudioByCodec) > 0 {
		fmt.Printf("\n=== Audio Delta by Codec ===\n")
		// Sort codec names for deterministic output (map order is random).
		codecs := make([]string, 0, len(deltaAudioByCodec))
		for codec := range deltaAudioByCodec {
			codecs = append(codecs, codec)
		}
		sort.Strings(codecs)
		for _, codec := range codecs {
			dc := deltaAudioByCodec[codec]
			fmt.Printf(" %-20s: %10s bytes (%8.2f MB) [%6d entries]\n",
				codec, formatInt(dc.bytes), float64(dc.bytes)/(1024*1024), dc.count)
		}
	}
	// Video NAL type breakdown
	nalTypeNames := map[int]string{
		1: "non-IDR slice", 2: "slice A", 3: "slice B", 4: "slice C",
		5: "IDR slice", 6: "SEI", 7: "SPS", 8: "PPS", 9: "AUD", 12: "filler",
	}
	hasNALBreakdown := false
	for i := 0; i < 32; i++ {
		if deltaVideoByNAL[i].count > 0 {
			hasNALBreakdown = true
			break
		}
	}
	if hasNALBreakdown {
		fmt.Printf("\n=== Video Delta by H.264 NAL Type ===\n")
		for i := 0; i < 32; i++ {
			if deltaVideoByNAL[i].count == 0 {
				continue
			}
			name := nalTypeNames[i]
			if name == "" {
				name = fmt.Sprintf("type %d", i)
			}
			fmt.Printf(" %-14s: %10s bytes (%8.2f MB) [%6d NALs]\n",
				name, formatInt(deltaVideoByNAL[i].bytes),
				float64(deltaVideoByNAL[i].bytes)/(1024*1024),
				deltaVideoByNAL[i].count)
		}
		fmt.Printf("\n=== Video Slice Delta Size Breakdown ===\n")
		fmt.Printf(" Slice NALs < 4KB: %10s bytes (%8.2f MB) [%6d NALs]\n",
			formatInt(deltaVideoSliceSmall.bytes), float64(deltaVideoSliceSmall.bytes)/(1024*1024),
			deltaVideoSliceSmall.count)
		fmt.Printf(" Slice NALs >= 4KB: %10s bytes (%8.2f MB) [%6d NALs]\n",
			formatInt(deltaVideoSliceLarge.bytes), float64(deltaVideoSliceLarge.bytes)/(1024*1024),
			deltaVideoSliceLarge.count)
	}
	// Summary
	fmt.Printf("\n=== Summary ===\n")
	fmt.Printf("Original file: %.2f MB\n", float64(origSize)/(1024*1024))
	fmt.Printf("Total delta: %.2f MB (%.1f%% of original)\n",
		float64(totalDelta)/(1024*1024), float64(totalDelta)/float64(origSize)*100)
	fmt.Printf(" Video delta: %.2f MB (%.1f%% of delta)\n",
		float64(deltaVideo.bytes)/(1024*1024), float64(deltaVideo.bytes)/float64(totalDelta)*100)
	fmt.Printf(" Audio delta: %.2f MB (%.1f%% of delta)\n",
		float64(deltaAudio.bytes)/(1024*1024), float64(deltaAudio.bytes)/float64(totalDelta)*100)
	fmt.Printf(" Container: %.2f MB (%.1f%% of delta)\n",
		float64(deltaContainer.bytes)/(1024*1024), float64(deltaContainer.bytes)/float64(totalDelta)*100)
	return nil
}
// deltadiagFindPacket locates the packet whose byte range [Offset, Offset+Size)
// contains offset, using binary search over packets sorted by Offset.
// Returns the packet's index, or -1 when offset lies outside every packet.
func deltadiagFindPacket(packets []mkv.Packet, offset int64) int {
	lo, hi := 0, len(packets)-1
	for lo <= hi {
		mid := lo + (hi-lo)/2 // overflow-safe midpoint
		switch p := packets[mid]; {
		case offset < p.Offset:
			hi = mid - 1
		case offset >= p.Offset+p.Size:
			lo = mid + 1
		default:
			return mid
		}
	}
	return -1
}
// deltadiagFindNextPacket returns the index of the first packet whose Offset
// is >= offset (packets must be sorted by Offset), or -1 when no such packet
// exists. Classic lower-bound binary search.
func deltadiagFindNextPacket(packets []mkv.Packet, offset int64) int {
	lo, hi := 0, len(packets)-1
	best := -1
	for lo <= hi {
		mid := lo + (hi-lo)/2
		if packets[mid].Offset < offset {
			lo = mid + 1
		} else {
			// Candidate found; keep searching left for an earlier one.
			best = mid
			hi = mid - 1
		}
	}
	return best
}
// deltadiagClassifyAVCC parses AVCC NAL units within a packet to classify which
// NAL types fall within the delta region [deltaStart, deltaEnd).
//
// AVCC framing: each NAL is preceded by a big-endian length prefix of
// nalLenSize bytes (1, 2, or 4). For every NAL overlapping the delta region,
// the overlapping byte count is tallied into byNAL by the NAL's 5-bit type;
// slice NALs (types 1 and 5) are additionally split into small (<4KB) and
// large (>=4KB) buckets. Offsets are clamped to len(mkvData) so a truncated
// mapping cannot cause out-of-range reads.
func deltadiagClassifyAVCC(mkvData []byte, pkt mkv.Packet, nalLenSize int,
	deltaStart, deltaEnd int64,
	byNAL *[32]deltaClass, sliceSmall, sliceLarge *deltaClass) {
	pktEnd := pkt.Offset + pkt.Size
	if pktEnd > int64(len(mkvData)) {
		pktEnd = int64(len(mkvData))
	}
	pktData := mkvData[pkt.Offset:pktEnd]
	pos := 0
	// Loop invariant: at least nalLenSize+1 bytes remain, enough for a
	// length prefix plus one byte of NAL payload.
	for pos+nalLenSize < len(pktData) {
		var nalLen uint32
		switch nalLenSize {
		case 4:
			nalLen = binary.BigEndian.Uint32(pktData[pos:])
		case 2:
			nalLen = uint32(binary.BigEndian.Uint16(pktData[pos:]))
		case 1:
			nalLen = uint32(pktData[pos])
		}
		nalDataStart := pkt.Offset + int64(pos+nalLenSize)
		nalDataEnd := nalDataStart + int64(nalLen)
		// Zero-length or truncated NAL: stop parsing this packet.
		if nalLen == 0 || nalDataEnd > pktEnd {
			break
		}
		nalFullStart := pkt.Offset + int64(pos)
		// Check overlap with delta region
		overlapStart := nalFullStart
		if overlapStart < deltaStart {
			overlapStart = deltaStart
		}
		overlapEnd := nalDataEnd
		if overlapEnd > deltaEnd {
			overlapEnd = deltaEnd
		}
		if overlapStart < overlapEnd {
			overlapBytes := overlapEnd - overlapStart
			if nalDataStart < int64(len(mkvData)) {
				// NAL type is the low 5 bits of the first payload byte.
				nalType := mkvData[nalDataStart] & 0x1F
				byNAL[nalType].bytes += overlapBytes
				byNAL[nalType].count++
				if nalType == 1 || nalType == 5 {
					if nalLen >= 4096 {
						sliceLarge.bytes += overlapBytes
						sliceLarge.count++
					} else {
						sliceSmall.bytes += overlapBytes
						sliceSmall.count++
					}
				}
			}
		}
		// Advance past this NAL (prefix + payload).
		pos = int(nalDataEnd - pkt.Offset)
		if pos <= 0 {
			break
		}
	}
}
package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/stuckj/mkvdup/internal/dedup"
)
// expandConfigCmd reads a config file (same format as mount/validate), resolves
// all includes glob patterns to explicit paths, and writes an expanded config
// that is semantically equivalent but with no wildcard patterns.
//
// With dryRun set, or when outputPath is empty, the expanded config is printed
// to stdout instead of written. When writing, the file is left untouched if
// its non-comment YAML content is already identical.
func expandConfigCmd(configPath string, outputPath string, dryRun bool) error {
	expanded, err := dedup.ExpandConfigFile(configPath)
	if err != nil {
		return err
	}

	// Add a generation header comment.
	absConfigPath, err := filepath.Abs(configPath)
	if err != nil {
		absConfigPath = configPath
	}
	var buf strings.Builder
	fmt.Fprintf(&buf, "# Auto-generated by: mkvdup expand-config\n")
	fmt.Fprintf(&buf, "# Source: %s\n", absConfigPath)
	fmt.Fprintf(&buf, "# Generated: %s\n", time.Now().UTC().Format(time.RFC3339))
	buf.Write(expanded)
	output := buf.String()

	// Dry-run mode and stdout mode both just print the result.
	if dryRun || outputPath == "" {
		fmt.Print(output)
		return nil
	}

	// Skip rewrite if the YAML content (non-comment lines) is unchanged,
	// to avoid triggering unnecessary mount reloads.
	if existing, readErr := os.ReadFile(outputPath); readErr == nil {
		if yamlContent(string(existing)) == yamlContent(output) {
			if !quiet {
				fmt.Fprintf(os.Stderr, "No changes to %s\n", outputPath)
			}
			return nil
		}
	}

	if err := os.WriteFile(outputPath, []byte(output), 0644); err != nil {
		return fmt.Errorf("write output file: %w", err)
	}
	absOutput, err := filepath.Abs(outputPath)
	if err != nil || absOutput == "" {
		absOutput = outputPath
	}
	if !quiet {
		fmt.Fprintf(os.Stderr, "Wrote %s\n", absOutput)
	}
	return nil
}
// yamlContent extracts the non-comment lines from a YAML string for comparison.
// This allows the header comments (timestamp, etc.) to change without triggering
// a rewrite when the actual config content is unchanged.
func yamlContent(s string) string {
	var b strings.Builder
	first := true
	for _, line := range strings.Split(s, "\n") {
		if strings.HasPrefix(line, "#") {
			continue
		}
		if !first {
			b.WriteByte('\n')
		}
		b.WriteString(line)
		first = false
	}
	return b.String()
}
package main
import (
"os"
"strconv"
)
// formatInt formats an integer with thousands separators (e.g., 1234567 → "1,234,567").
//
// Fix: the sign is stripped before grouping. Previously the '-' character was
// counted as a digit, so negative values with 1-3 digits were mangled
// (e.g., -100 → "-,100").
func formatInt(n int64) string {
	s := strconv.FormatInt(n, 10)
	sign := ""
	if s[0] == '-' {
		sign, s = "-", s[1:]
	}
	if len(s) <= 3 {
		return sign + s
	}
	// Insert commas from the right, every three digits.
	var result []byte
	for i := 0; i < len(s); i++ {
		if i > 0 && (len(s)-i)%3 == 0 {
			result = append(result, ',')
		}
		result = append(result, s[i])
	}
	return sign + string(result)
}
// plural chooses between the singular and plural form of a word based on n:
// the singular form only when n is exactly 1, the plural form otherwise
// (including 0 and negatives). Example: plural(n, "file", "files").
func plural(n int, singular, pl string) string {
	if n != 1 {
		return pl
	}
	return singular
}
// isTerminal reports whether stdin is attached to a terminal (a character
// device) rather than a pipe or file. A Stat failure is treated as
// "not a terminal".
func isTerminal() bool {
	info, err := os.Stdin.Stat()
	return err == nil && info.Mode()&os.ModeCharDevice != 0
}
package main
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"time"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/dedup"
)
// showInfo displays information about a dedup file: format metadata, sizes,
// checksums, and the list of referenced source files. With hideUnused set,
// source files flagged as unused are omitted from the listing (only when the
// file format carries used-flags).
func showInfo(dedupPath string, hideUnused bool) error {
	reader, err := dedup.NewReader(dedupPath, "")
	if err != nil {
		return fmt.Errorf("open dedup file: %w", err)
	}
	defer reader.Close()

	info := reader.Info()
	fmt.Printf("Dedup file: %s\n", dedupPath)
	fmt.Println()

	if creator := info["creator_version"].(string); creator != "" {
		fmt.Printf("Created by: %s\n", creator)
	} else {
		// Files written before 0.9.0 did not record a creator version.
		fmt.Printf("Created by: unknown (pre-0.9.0)\n")
	}
	fmt.Printf("Format version: %d\n", info["version"].(uint32))
	origSize := info["original_size"].(int64)
	fmt.Printf("Original MKV size: %s bytes (%.2f MB)\n", formatInt(origSize), float64(origSize)/(1024*1024))
	fmt.Printf("Original checksum: %016x\n", info["original_checksum"].(uint64))
	fmt.Println()

	var sourceType string
	switch info["source_type"].(uint8) {
	case 0:
		sourceType = "DVD"
	case 1:
		sourceType = "Blu-ray"
	default:
		sourceType = "Unknown"
	}
	fmt.Printf("Source type: %s\n", sourceType)
	fmt.Printf("Uses ES offsets: %v\n", info["uses_es_offsets"].(bool))
	if info["has_range_maps"].(bool) {
		fmt.Printf("Has range maps: true\n")
	}
	fmt.Printf("Source file count: %d\n", info["source_file_count"].(int))
	fmt.Printf("Index entry count: %d\n", info["entry_count"].(int))
	deltaSize := info["delta_size"].(int64)
	fmt.Printf("Delta size: %s bytes (%.2f MB)\n", formatInt(deltaSize), float64(deltaSize)/(1024*1024))
	fmt.Println()

	// Source files
	fmt.Println("Source files:")
	hasUsedFlags := reader.HasSourceUsedFlags()
	for _, sf := range reader.SourceFiles() {
		unused := hasUsedFlags && !sf.Used
		if hideUnused && unused {
			continue
		}
		suffix := ""
		if unused {
			suffix = " (unused)"
		}
		fmt.Printf(" %s (%s bytes)%s\n", sf.RelativePath, formatInt(sf.Size), suffix)
	}
	return nil
}
// calculateFileChecksum calculates xxhash checksum of a file.
// It is a convenience wrapper over calculateFileChecksumWithProgress with
// progress display disabled (expectedSize 0 suppresses progress output).
func calculateFileChecksum(path string) (uint64, error) {
	return calculateFileChecksumWithProgress(path, 0, "")
}
// calculateFileChecksumWithProgress calculates xxhash checksum of a file,
// showing inline progress when expectedSize > 0.
//
// When expectedSize <= 0 the file is hashed silently via io.Copy. Otherwise
// the file is read in 4MB chunks with a "\r"-refreshed progress line printed
// to stdout at most ~every 500ms, then cleared when hashing completes.
// displayName is used only in the progress line text.
func calculateFileChecksumWithProgress(path string, expectedSize int64, displayName string) (uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()
	hasher := xxhash.New()
	showProgress := expectedSize > 0
	if !showProgress {
		// No progress wanted: stream the whole file through the hasher.
		if _, err := io.Copy(hasher, f); err != nil {
			return 0, err
		}
		return hasher.Sum64(), nil
	}
	buf := make([]byte, 4*1024*1024) // 4MB buffer
	var processed int64
	// Zero time forces the first progress line to print immediately.
	lastProgress := time.Time{}
	for {
		n, err := f.Read(buf)
		if n > 0 {
			// Per the io.Reader contract, bytes read must be consumed
			// before inspecting err (n > 0 can accompany a non-nil error).
			if _, werr := hasher.Write(buf[:n]); werr != nil {
				return 0, werr
			}
			processed += int64(n)
			if time.Since(lastProgress) > 500*time.Millisecond {
				pct := float64(processed) / float64(expectedSize) * 100
				fmt.Printf("\r Verifying %s... %.1f%%", displayName, pct)
				lastProgress = time.Now()
			}
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			return 0, err
		}
	}
	// Clear progress line
	progressText := fmt.Sprintf(" Verifying %s... 100.0%%", displayName)
	fmt.Printf("\r%s\r", strings.Repeat(" ", len(progressText)))
	return hasher.Sum64(), nil
}
// checkDedup checks the integrity of a dedup file and its source files.
// It runs up to three phases: (1) verify the dedup container itself,
// (2) confirm each referenced source file exists with the expected size,
// and (3) when sourceChecksums is set and phase 2 passed, re-hash every
// source file and compare checksums. Returns an error summarizing the
// number of failures, or nil when everything checks out.
func checkDedup(dedupPath, sourceDir string, sourceChecksums bool) error {
	fmt.Printf("Checking dedup file: %s\n", dedupPath)
	fmt.Printf("Source directory: %s\n", sourceDir)
	fmt.Println()

	// Phase 1: Open and verify dedup file integrity
	reader, err := dedup.NewReader(dedupPath, sourceDir)
	if err != nil {
		return fmt.Errorf("open dedup file: %w", err)
	}
	defer reader.Close()
	fmt.Print("Checking dedup file integrity...")
	if err := reader.VerifyIntegrity(); err != nil {
		fmt.Println(" FAILED")
		return fmt.Errorf("integrity check: %w", err)
	}
	fmt.Println(" OK")

	// Phase 2: Check source files exist with correct sizes
	sourceFiles := reader.SourceFiles()
	fmt.Printf("\nChecking source files (%d %s)...\n", len(sourceFiles), plural(len(sourceFiles), "file", "files"))
	failures := 0
	for _, sf := range sourceFiles {
		path := filepath.Join(sourceDir, sf.RelativePath)
		st, statErr := os.Stat(path)
		switch {
		case statErr != nil:
			fmt.Printf(" FAILED %s: %v\n", sf.RelativePath, statErr)
			failures++
		case st.Size() != sf.Size:
			fmt.Printf(" FAILED %s: size mismatch (expected %s, got %s)\n",
				sf.RelativePath, formatInt(sf.Size), formatInt(st.Size()))
			failures++
		default:
			fmt.Printf(" OK %s (%s bytes)\n", sf.RelativePath, formatInt(sf.Size))
		}
	}

	// Phase 3: Optionally verify source file checksums
	if sourceChecksums {
		if failures > 0 {
			fmt.Println("\nSkipping source checksum verification due to earlier errors")
		} else {
			fmt.Printf("\nVerifying source file checksums...\n")
			for _, sf := range sourceFiles {
				path := filepath.Join(sourceDir, sf.RelativePath)
				sum, hashErr := calculateFileChecksumWithProgress(path, sf.Size, sf.RelativePath)
				switch {
				case hashErr != nil:
					fmt.Printf(" FAILED %s: %v\n", sf.RelativePath, hashErr)
					failures++
				case sum != sf.Checksum:
					fmt.Printf(" FAILED %s: checksum mismatch (expected %016x, got %016x)\n",
						sf.RelativePath, sf.Checksum, sum)
					failures++
				default:
					fmt.Printf(" OK %s\n", sf.RelativePath)
				}
			}
		}
	}

	// Final summary
	fmt.Println()
	if failures > 0 {
		return fmt.Errorf("check FAILED: %d %s found", failures, plural(failures, "error", "errors"))
	}
	fmt.Println("Check PASSED")
	return nil
}
package main
import (
"fmt"
"log"
"log/syslog"
"os"
"os/signal"
"path/filepath"
"sync"
"syscall"
"github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
"github.com/stuckj/mkvdup/internal/daemon"
"github.com/stuckj/mkvdup/internal/dedup"
mkvfuse "github.com/stuckj/mkvdup/internal/fuse"
)
// defaultConfigPath is the default config file location, used by mountFuse
// when no config paths are supplied on the command line.
const defaultConfigPath = "/etc/mkvdup.conf"
// expandConfigDir expands a directory path to the list of .yaml/.yml files it
// directly contains (non-recursive; the extension match is case-sensitive).
// Entries are returned in os.ReadDir's sorted filename order. An error is
// returned if the directory cannot be read or holds no YAML files.
func expandConfigDir(dir string) ([]string, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, fmt.Errorf("read config directory %s: %w", dir, err)
	}
	var files []string
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		switch filepath.Ext(e.Name()) {
		case ".yaml", ".yml":
			files = append(files, filepath.Join(dir, e.Name()))
		}
	}
	if len(files) == 0 {
		return nil, fmt.Errorf("no YAML files (.yaml, .yml) found in %s", dir)
	}
	return files, nil
}
// mountFuse mounts a FUSE filesystem exposing dedup files as MKV files.
//
// Lifecycle: daemonize (unless foreground), resolve configs, load the
// permission store, build and mount the filesystem, start source/config
// watchers, install signal handlers (SIGHUP = reload, SIGINT/SIGTERM =
// unmount), then block in server.Wait() until unmounted.
func mountFuse(mountpoint string, configPaths []string, opts MountOptions) error {
	// Daemonize unless --foreground is set or we're already a daemon child
	if !opts.Foreground && !daemon.IsChild() {
		return daemon.Daemonize(opts.PidFile, opts.DaemonTimeout)
	}
	// Write PID file in foreground mode (daemon mode writes it in Daemonize)
	if opts.Foreground && opts.PidFile != "" {
		if err := daemon.WritePidFile(opts.PidFile, os.Getpid()); err != nil {
			return fmt.Errorf("write pid file: %w", err)
		}
	}
	// Clean up PID file on exit (for both foreground and daemon child modes)
	if opts.PidFile != "" && (opts.Foreground || daemon.IsChild()) {
		defer func() {
			// Best-effort removal; the mount is already down at this point.
			_ = daemon.RemovePidFile(opts.PidFile)
		}()
	}
	// If no config paths provided, use default
	if len(configPaths) == 0 {
		if _, err := os.Stat(defaultConfigPath); err == nil {
			configPaths = []string{defaultConfigPath}
		} else {
			if daemon.IsChild() {
				// Tell the waiting parent process why startup failed.
				daemon.NotifyError(fmt.Errorf("no config files specified and %s not found", defaultConfigPath))
			}
			return fmt.Errorf("no config files specified and %s not found", defaultConfigPath)
		}
	}
	// Store the config-dir path for SIGHUP re-expansion
	var configDirPath string
	if opts.ConfigDir {
		configDirPath = configPaths[0]
	}
	// If configDir is set, expand directory to list of .yaml files
	if opts.ConfigDir {
		if len(configPaths) != 1 {
			err := fmt.Errorf("--config-dir requires exactly one directory path, got %d", len(configPaths))
			if daemon.IsChild() {
				daemon.NotifyError(err)
			}
			return err
		}
		expanded, err := expandConfigDir(configPaths[0])
		if err != nil {
			if daemon.IsChild() {
				daemon.NotifyError(err)
			}
			return err
		}
		configPaths = expanded
	}
	// Set up permission store
	defaults := mkvfuse.Defaults{
		FileUID:  opts.DefaultUID,
		FileGID:  opts.DefaultGID,
		FileMode: opts.DefaultFileMode,
		DirUID:   opts.DefaultUID,
		DirGID:   opts.DefaultGID,
		DirMode:  opts.DefaultDirMode,
	}
	permPath := mkvfuse.ResolvePermissionsPath(opts.PermissionsFile)
	permStore := mkvfuse.NewPermissionStore(permPath, defaults, verbose)
	if err := permStore.Load(); err != nil {
		if daemon.IsChild() {
			daemon.NotifyError(fmt.Errorf("load permissions: %w", err))
		}
		return fmt.Errorf("load permissions: %w", err)
	}
	// Resolve configs (expands includes, globs, virtual_files) and extract
	// on_error_command (first-wins across all config files).
	configs, errorCmdConfig, loadedConfigPaths, err := dedup.ResolveConfigs(configPaths)
	if err != nil {
		err = fmt.Errorf("resolve configs: %w", err)
		if daemon.IsChild() {
			daemon.NotifyError(err)
		}
		return err
	}
	opts.OnErrorCommand = errorCmdConfig
	// Create the root filesystem
	root, err := mkvfuse.NewMKVFSFromConfigs(configs, verbose, &mkvfuse.DefaultReaderFactory{ReadTimeout: opts.SourceReadTimeout}, permStore)
	if err != nil {
		err = fmt.Errorf("create filesystem: %w", err)
		if daemon.IsChild() {
			daemon.NotifyError(err)
		}
		return err
	}
	// Mount the filesystem
	fuseOpts := &fs.Options{
		MountOptions: fuse.MountOptions{
			AllowOther: opts.AllowOther,
			Name:       "mkvdup",
			FsName:     "mkvdup",
			MaxWrite:   1 << 20, // 1MB max read/write; go-fuse sets max_read = MaxWrite
			// Enable kernel permission checks for standard Unix semantics.
			// This properly handles supplementary groups and matches behavior
			// of real filesystems (ext4, XFS, btrfs, etc.).
			Options: []string{"default_permissions"},
		},
	}
	server, err := fs.Mount(mountpoint, root, fuseOpts)
	if err != nil {
		err = fmt.Errorf("mount: %w", err)
		if daemon.IsChild() {
			daemon.NotifyError(err)
		}
		return err
	}
	// Wait for mount to be ready
	server.WaitMount()
	// Enable FUSE kernel notifications (NotifyDelete, NotifyEntry, etc.)
	// now that the go-fuse bridge is initialized.
	root.SetMounted()
	// In daemon mode, redirect log output to syslog before starting watchers
	// so that all log.Printf calls (from watchers, doReload, BuildDirectoryTree)
	// go to syslog. Must happen before daemon.Detach() which redirects stderr
	// to /dev/null.
	if daemon.IsChild() {
		if w, err := syslog.New(syslog.LOG_INFO|syslog.LOG_DAEMON, "mkvdup"); err == nil {
			log.SetOutput(w)
			log.SetFlags(0) // syslog adds its own timestamp
			defer w.Close()
		}
	}
	// Set up source file watcher (monitors source files for changes)
	var sourceWatcher *mkvfuse.SourceWatcher
	if !opts.NoSourceWatch {
		// Closure over log.Printf: syslog setup above redirects the default
		// logger's output, so the watcher automatically picks it up.
		watchLogFn := func(format string, args ...interface{}) {
			log.Printf(format, args...)
		}
		var err error
		sourceWatcher, err = mkvfuse.NewSourceWatcher(opts.OnSourceChange, opts.SourceWatchPollInterval, opts.OnErrorCommand, watchLogFn)
		if err != nil {
			// Non-fatal: the mount works without the watcher.
			log.Printf("source-watch: warning: failed to create watcher: %v", err)
		} else {
			sourceWatcher.Update(root.Files(), &mkvfuse.DefaultReaderFactory{ReadTimeout: opts.SourceReadTimeout})
			sourceWatcher.Start()
		}
	}
	// Declare configWatcher before doReload so the closure can reference it.
	// Initialized below after doReload is defined.
	var configWatcher *mkvfuse.ConfigWatcher
	// doReload performs a config reload. Called by the SIGHUP handler and
	// the config file watcher callback. Serialized by reloadMu to prevent
	// concurrent reloads from racing on root.Reload() and watcher updates.
	// Uses log.Printf which is redirected to syslog in daemon mode (see
	// log.SetOutput above).
	var reloadMu sync.Mutex
	doReload := func() {
		reloadMu.Lock()
		defer reloadMu.Unlock()
		log.Printf("reloading config...")
		// Re-expand config-dir if applicable
		var reloadPaths []string
		if configDirPath != "" {
			expanded, err := expandConfigDir(configDirPath)
			if err != nil {
				log.Printf("reload failed: expand config dir: %v", err)
				return
			}
			reloadPaths = expanded
		} else {
			reloadPaths = configPaths
		}
		// Resolve configs (expands includes, globs, virtual_files)
		configs, _, newConfigPaths, err := dedup.ResolveConfigs(reloadPaths)
		if err != nil {
			log.Printf("reload failed: resolve configs: %v", err)
			return
		}
		// Reload the filesystem
		if err := root.Reload(configs, func(format string, args ...interface{}) {
			log.Printf(format, args...)
		}); err != nil {
			log.Printf("reload failed: %v", err)
			return
		}
		// Update source watcher with new file set
		if sourceWatcher != nil {
			sourceWatcher.Update(root.Files(), &mkvfuse.DefaultReaderFactory{ReadTimeout: opts.SourceReadTimeout})
		}
		// Update config watcher with new config file set
		if configWatcher != nil {
			configWatcher.Update(newConfigPaths)
		}
		log.Printf("config reloaded successfully")
	}
	// Set up config file watcher (monitors config files for changes)
	if !opts.NoConfigWatch {
		watchLogFn := func(format string, args ...interface{}) {
			log.Printf(format, args...)
		}
		var err error
		configWatcher, err = mkvfuse.NewConfigWatcher(opts.OnConfigChange, opts.SourceWatchPollInterval, doReload, watchLogFn)
		if err != nil {
			// Non-fatal: the mount works without config watching.
			log.Printf("config-watch: warning: failed to create watcher: %v", err)
		} else {
			configWatcher.Update(loadedConfigPaths)
			configWatcher.Start()
		}
	}
	// If we're a daemon child, signal success and detach from terminal
	if daemon.IsChild() {
		if err := daemon.NotifyReady(); err != nil {
			// Parent may have timed out; log and continue since mount succeeded
			printWarn("warning: failed to notify parent: %v\n", err)
		}
		daemon.Detach()
	} else {
		// Running in foreground mode - print info
		fmt.Printf("Mounted at %s\n", mountpoint)
		fmt.Printf("Files:\n")
		for _, configPath := range configPaths {
			// Read errors are ignored here; this listing is informational only.
			config, _ := dedup.ReadConfig(configPath)
			if config != nil {
				fmt.Printf(" %s\n", config.Name)
			}
		}
		fmt.Println()
		fmt.Println("Press Ctrl+C to unmount")
	}
	// Handle signals for graceful shutdown and config reload
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
	go func() {
		for sig := range sigChan {
			switch sig {
			case syscall.SIGHUP:
				doReload()
			case syscall.SIGINT, syscall.SIGTERM:
				if !daemon.IsChild() {
					fmt.Println("\nUnmounting...")
				}
				server.Unmount()
				return
			}
		}
	}()
	// Serve until unmounted
	server.Wait()
	// Stop watchers
	if configWatcher != nil {
		configWatcher.Stop()
	}
	if sourceWatcher != nil {
		sourceWatcher.Stop()
	}
	if !daemon.IsChild() {
		fmt.Println("Unmounted")
	}
	return nil
}
// reloadDaemon validates config files and sends SIGHUP to the running daemon.
// If configPaths is non-empty the configuration is validated first, and the
// reload signal is only sent when validation passes.
func reloadDaemon(pid int, configPaths []string, configDir bool) error {
	// Verify the process exists (on Unix, FindProcess always succeeds;
	// send signal 0 to check if process is actually running)
	proc, err := os.FindProcess(pid)
	if err != nil {
		return fmt.Errorf("find process %d: %w", pid, err)
	}
	if err := proc.Signal(syscall.Signal(0)); err != nil {
		return fmt.Errorf("daemon process %d is not running: %w", pid, err)
	}

	// Validate config if paths provided
	if len(configPaths) > 0 {
		resolved, resolveErr := resolveConfigPaths(configPaths, configDir)
		if resolveErr != nil {
			return fmt.Errorf("resolve config paths: %w", resolveErr)
		}
		fmt.Println("Validating configuration...")
		allEntries, _, hasErrors := validateConfigEntries(resolved)
		nameErrors, _ := checkNameConflicts(allEntries)
		if hasErrors || nameErrors {
			return fmt.Errorf("config validation failed, not sending reload signal")
		}
		fmt.Println("Configuration valid.")
		fmt.Println()
	}

	// Send SIGHUP to the daemon
	fmt.Printf("Sending SIGHUP to daemon (pid %d)...\n", pid)
	if err := proc.Signal(syscall.SIGHUP); err != nil {
		return fmt.Errorf("send SIGHUP to process %d: %w", pid, err)
	}
	fmt.Println("Reload signal sent successfully.")
	return nil
}
// resolveConfigPaths expands --config-dir and applies defaults to get the final
// list of config file paths to validate. With configDir set, exactly one
// directory must be given and it is expanded to its YAML files; otherwise the
// paths are returned as-is (erroring when none were provided).
func resolveConfigPaths(configPaths []string, configDir bool) ([]string, error) {
	switch {
	case configDir:
		if len(configPaths) != 1 {
			return nil, fmt.Errorf("--config-dir requires exactly one directory path, got %d", len(configPaths))
		}
		return expandConfigDir(configPaths[0])
	case len(configPaths) == 0:
		return nil, fmt.Errorf("no config files specified\nRun 'mkvdup validate --help' for usage")
	default:
		return configPaths, nil
	}
}
package main
import (
"fmt"
"os"
"path/filepath"
"sort"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// ProbeResult represents the result of probing a source against an MKV.
type ProbeResult struct {
	MKVPath      string  // MKV file that was probed
	SourcePath   string  // source directory checked against
	MatchCount   int     // probe hashes found in the source index
	TotalSamples int     // total probe hashes computed from the MKV
	MatchPercent float64 // MatchCount / TotalSamples * 100
}
// mkvProbeData holds the pre-computed probe hashes for a single MKV file.
type mkvProbeData struct {
	Path        string              // path to the MKV file
	HashCount   int                 // len(ProbeHashes), cached for reporting
	ProbeHashes []matcher.ProbeHash // sampled hashes used for matching
	Error       string              // non-empty if MKV could not be parsed
}
// probe tests if one or more MKV files match one or more source directories.
// When multiple MKVs are provided, each source is indexed only once and all
// MKV hash sets are checked against it, making multi-MKV probing much faster.
//
// Returns nil even when individual MKVs or sources fail; per-item errors are
// printed and reflected in the results table instead.
func probe(mkvPaths []string, sourceDirs []string) error {
	fmt.Printf("Probing %d MKV(s) against %d source(s)...\n", len(mkvPaths), len(sourceDirs))
	fmt.Println()
	windowSize := source.DefaultWindowSize

	// Phase 1: Parse all MKVs and compute probe hashes.
	mkvData := make([]mkvProbeData, 0, len(mkvPaths))
	for i, mkvPath := range mkvPaths {
		if len(mkvPaths) > 1 {
			fmt.Printf("[%d/%d] Parsing %s...\n", i+1, len(mkvPaths), filepath.Base(mkvPath))
		} else {
			fmt.Printf("Parsing %s...\n", filepath.Base(mkvPath))
		}
		hashes, err := computeProbeHashes(mkvPath, windowSize)
		if err != nil {
			// Record the failure so later phases can report it alongside
			// the successful MKVs.
			fmt.Printf(" Error: %v\n", err)
			mkvData = append(mkvData, mkvProbeData{
				Path:  mkvPath,
				Error: err.Error(),
			})
			continue
		}
		fmt.Printf(" Computed %d probe hashes\n", len(hashes))
		mkvData = append(mkvData, mkvProbeData{
			Path:        mkvPath,
			HashCount:   len(hashes),
			ProbeHashes: hashes,
		})
	}
	fmt.Println()

	// Phase 2: For each source, index once and check all MKV hash sets.
	// results[mkvIdx] = []ProbeResult for that MKV.
	results := make([][]ProbeResult, len(mkvData))
	for i := range results {
		results[i] = make([]ProbeResult, 0, len(sourceDirs))
	}
	// recordIndexFailure appends a zero-match result for every parseable MKV
	// when a source could not be indexed. This loop previously appeared
	// verbatim in both the NewIndexer and Build error paths.
	recordIndexFailure := func(sourceDir string) {
		for i, md := range mkvData {
			if md.Error != "" {
				continue
			}
			results[i] = append(results[i], ProbeResult{
				MKVPath:      md.Path,
				SourcePath:   sourceDir,
				TotalSamples: md.HashCount,
			})
		}
	}
	for _, sourceDir := range sourceDirs {
		fmt.Printf("Indexing source: %s...\n", sourceDir)
		indexer, err := source.NewIndexer(sourceDir, windowSize)
		if err != nil {
			fmt.Printf(" Error: %v\n", err)
			recordIndexFailure(sourceDir)
			continue
		}
		indexer.SetVerboseWriter(verboseWriter())
		if err := indexer.Build(nil); err != nil {
			fmt.Printf(" Error building index: %v\n", err)
			recordIndexFailure(sourceDir)
			continue
		}
		index := indexer.Index()
		// Check each MKV's hashes against this source.
		for i, md := range mkvData {
			if md.Error != "" {
				continue
			}
			matchCount := 0
			for _, ph := range md.ProbeHashes {
				if locs, ok := index.HashToLocations[ph.Hash]; ok {
					if index.UsesESOffsets {
						// With ES offsets, only count a hit whose stream
						// kind (video vs non-video) matches the probe's.
						for _, loc := range locs {
							if loc.IsVideo == ph.IsVideo {
								matchCount++
								break
							}
						}
					} else if len(locs) > 0 {
						matchCount++
					}
				}
			}
			matchPercent := float64(matchCount) / float64(md.HashCount) * 100
			results[i] = append(results[i], ProbeResult{
				MKVPath:      md.Path,
				SourcePath:   sourceDir,
				MatchCount:   matchCount,
				TotalSamples: md.HashCount,
				MatchPercent: matchPercent,
			})
			if len(mkvPaths) > 1 {
				fmt.Printf(" %s: %d/%d (%.0f%%)\n",
					filepath.Base(md.Path), matchCount, md.HashCount, matchPercent)
			} else {
				fmt.Printf(" Matched %d/%d hashes (%.0f%%)\n",
					matchCount, md.HashCount, matchPercent)
			}
		}
		index.Close()
	}

	// Phase 3: Print results, best match first per MKV.
	fmt.Println()
	fmt.Println("=== Results ===")
	for i, md := range mkvData {
		if md.Error != "" {
			fmt.Printf("\n %s: ERROR: %s\n", filepath.Base(md.Path), md.Error)
			continue
		}
		if len(mkvPaths) > 1 {
			fmt.Printf("\n %s:\n", filepath.Base(md.Path))
		} else {
			fmt.Println()
		}
		// Sort this MKV's results by match percentage.
		sort.Slice(results[i], func(a, b int) bool {
			return results[i][a].MatchPercent > results[i][b].MatchPercent
		})
		for _, r := range results[i] {
			indicator := ""
			if r.MatchPercent >= 80 {
				indicator = " ← likely match"
			} else if r.MatchPercent >= 40 {
				indicator = " ← possible match"
			}
			// The single- and multi-MKV branches here printed the exact
			// same line; collapsed into one Printf.
			fmt.Printf(" %s %d/%d matches (%.0f%%)%s\n",
				r.SourcePath, r.MatchCount, r.TotalSamples, r.MatchPercent, indicator)
		}
	}
	fmt.Println()
	fmt.Println("Interpretation:")
	fmt.Println(" 80-100%: Very likely the correct source")
	fmt.Println(" 40-80%: Possible match (may be partial content)")
	fmt.Println(" <40%: Unlikely to be the source")
	return nil
}
// computeProbeHashes parses an MKV and returns its probe hashes.
// It samples a spread of packets across the file, reads up to 4KiB of each
// directly from disk, and extracts one probe hash per sampled packet.
func computeProbeHashes(mkvPath string, windowSize int) ([]matcher.ProbeHash, error) {
	parser, _, err := parseMKVWithProgress(mkvPath, "")
	if err != nil {
		return nil, err
	}
	defer parser.Close()
	packets := parser.Packets()
	if len(packets) == 0 {
		return nil, fmt.Errorf("no packets found in MKV")
	}
	// Per-track metadata needed to interpret packet payloads.
	isVideoTrack := make(map[int]bool)
	nalSizeByTrack := make(map[int]int)
	for _, t := range parser.Tracks() {
		isVideoTrack[int(t.Number)] = t.Type == mkv.TrackTypeVideo
		nalSizeByTrack[int(t.Number)] = matcher.NALLengthSizeForTrack(t.CodecID, t.CodecPrivate)
	}
	samples := samplePackets(packets, 20)
	mkvFile, err := os.Open(mkvPath)
	if err != nil {
		return nil, fmt.Errorf("open MKV: %w", err)
	}
	defer mkvFile.Close()
	var probeHashes []matcher.ProbeHash
	for _, pkt := range samples {
		// Read at most 4KiB of the packet; skip packets smaller than one
		// hash window, since no hash could be computed from them.
		readSize := pkt.Size
		if readSize > 4096 {
			readSize = 4096
		}
		if readSize < int64(windowSize) {
			continue
		}
		buf := make([]byte, readSize)
		n, err := mkvFile.ReadAt(buf, pkt.Offset)
		if err != nil || n < windowSize {
			continue
		}
		track := int(pkt.TrackNum)
		hashes := matcher.ExtractProbeHashes(buf[:n], isVideoTrack[track], windowSize, nalSizeByTrack[track])
		if len(hashes) > 0 {
			// Keep only the first hash from each sampled packet.
			probeHashes = append(probeHashes, hashes[0])
		}
	}
	if len(probeHashes) == 0 {
		return nil, fmt.Errorf("no valid hashes computed from sampled packets")
	}
	return probeHashes, nil
}
// samplePackets selects N packets distributed across the file:
// - 25% from first 10% of packets (early content)
// - 50% from middle 80% of packets (main content)
// - 25% from last 10% of packets (late content)
// If the file has N packets or fewer, all packets are returned as-is.
func samplePackets(packets []mkv.Packet, n int) []mkv.Packet {
	if len(packets) <= n {
		return packets
	}
	// Distribution of the n samples across the three regions.
	earlyCount := n / 4
	lateCount := n / 4
	midCount := n - earlyCount - lateCount
	// Region boundaries, clamped so each region is non-degenerate.
	earlyEnd := len(packets) / 10
	if earlyEnd < 1 {
		earlyEnd = 1
	}
	lateStart := len(packets) - len(packets)/10
	if lateStart <= earlyEnd {
		lateStart = earlyEnd + 1
	}
	out := make([]mkv.Packet, 0, n)
	// take appends evenly-stepped packets from [lo, hi) until the running
	// sample count reaches limit. It replaces three near-identical loops.
	take := func(lo, hi, count, limit int) {
		if count <= 0 || hi <= lo {
			return
		}
		step := (hi - lo) / count
		if step < 1 {
			step = 1
		}
		for i := lo; i < hi && len(out) < limit; i += step {
			out = append(out, packets[i])
		}
	}
	take(0, earlyEnd, earlyCount, earlyCount)                       // first 10%
	take(earlyEnd, lateStart, midCount, earlyCount+midCount)        // middle 80%
	take(lateStart, len(packets), lateCount, n)                     // last 10%
	return out
}
package main
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"syscall"
"gopkg.in/yaml.v3"
)
// relocateDedup moves an .mkvdup file and its .mkvdup.yaml sidecar to a new
// location, recalculating relative paths in the sidecar so they resolve to
// the same absolute locations from the new position.
//
// force allows overwriting an existing destination (and removes an orphaned
// destination sidecar when the source has none); dryRun prints what would
// happen — including the rewritten sidecar contents — without touching disk.
func relocateDedup(src, dst string, force, dryRun bool) error {
	// Resolve source to absolute path
	absSrc, err := filepath.Abs(src)
	if err != nil {
		return fmt.Errorf("resolve source path: %w", err)
	}
	// Verify source .mkvdup file exists
	srcInfo, err := os.Stat(absSrc)
	if err != nil {
		return fmt.Errorf("source file: %w", err)
	}
	if srcInfo.IsDir() {
		return fmt.Errorf("source %s is a directory, expected an .mkvdup file", absSrc)
	}
	// Determine sidecar path; a missing sidecar is allowed, any other Stat
	// error is not.
	sidecarSrc := absSrc + ".yaml"
	hasSidecar := true
	if _, err := os.Stat(sidecarSrc); os.IsNotExist(err) {
		hasSidecar = false
	} else if err != nil {
		return fmt.Errorf("check sidecar: %w", err)
	}
	// Resolve destination
	absDst, err := filepath.Abs(dst)
	if err != nil {
		return fmt.Errorf("resolve destination path: %w", err)
	}
	// If destination is an existing directory, or an explicitly-directory path
	// (e.g. ends with a path separator, like "/new/location/"), move into it
	// with the same filename. The raw dst is inspected for the trailing
	// separator because filepath.Abs cleans it away.
	dstInfo, err := os.Stat(absDst)
	isDirDst := false
	if err == nil && dstInfo.IsDir() {
		isDirDst = true
	} else if os.IsNotExist(err) && len(dst) > 0 && os.IsPathSeparator(dst[len(dst)-1]) {
		isDirDst = true
	} else if err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("check destination: %w", err)
	}
	if isDirDst {
		absDst = filepath.Join(absDst, filepath.Base(absSrc))
	}
	// Don't relocate to the same path
	if absSrc == absDst {
		return fmt.Errorf("source and destination are the same: %s", absSrc)
	}
	sidecarDst := absDst + ".yaml"
	// Check destination doesn't already exist (unless --force)
	if !force {
		if _, err := os.Stat(absDst); err == nil {
			return fmt.Errorf("destination %s already exists (use --force to overwrite)", absDst)
		} else if !os.IsNotExist(err) {
			return fmt.Errorf("check destination %s: %w", absDst, err)
		}
		// Always check for existing destination sidecar, even if source has none,
		// to avoid leaving stale/mismatched sidecars.
		if _, err := os.Stat(sidecarDst); err == nil {
			return fmt.Errorf("destination sidecar %s already exists (use --force to overwrite)", sidecarDst)
		} else if !os.IsNotExist(err) {
			return fmt.Errorf("check destination sidecar %s: %w", sidecarDst, err)
		}
	}
	// Read and update sidecar if it exists, preserving all YAML keys/comments
	// by editing the yaml.Node tree instead of round-tripping through a struct.
	var updatedSidecar []byte
	if hasSidecar {
		sidecarData, err := os.ReadFile(sidecarSrc)
		if err != nil {
			return fmt.Errorf("read sidecar: %w", err)
		}
		var doc yaml.Node
		if err := yaml.Unmarshal(sidecarData, &doc); err != nil {
			return fmt.Errorf("parse sidecar %s: %w", sidecarSrc, err)
		}
		if doc.Kind != yaml.DocumentNode || len(doc.Content) == 0 {
			return fmt.Errorf("sidecar %s: unexpected YAML structure", sidecarSrc)
		}
		root := doc.Content[0]
		if root.Kind != yaml.MappingNode {
			return fmt.Errorf("sidecar %s: expected YAML mapping, got %v", sidecarSrc, root.Kind)
		}
		// Extract current values for dedup_file and source_dir
		oldDedupFile := yamlNodeValue(root, "dedup_file")
		oldSourceDir := yamlNodeValue(root, "source_dir")
		if oldDedupFile == "" || oldSourceDir == "" {
			return fmt.Errorf("sidecar %s: missing required dedup_file or source_dir", sidecarSrc)
		}
		srcDir := filepath.Dir(absSrc)
		dstDir := filepath.Dir(absDst)
		// dedup_file should point to the new location (since the .mkvdup file
		// itself is being moved). Use the basename for relative paths (sidecar
		// and dedup file are always in the same directory), or the new absolute
		// path if the original was absolute.
		var newDedupFile string
		if filepath.IsAbs(oldDedupFile) {
			newDedupFile = absDst
		} else {
			newDedupFile = filepath.Base(absDst)
		}
		// source_dir points to a static location — recalculate relative to new position
		newSourceDir, err := recalcRelativePath(srcDir, dstDir, oldSourceDir)
		if err != nil {
			return fmt.Errorf("recalculate source_dir path: %w", err)
		}
		// Validate that source_dir is still reachable from the new location
		absSourceDir := resolveRelPath(dstDir, newSourceDir)
		sdInfo, err := os.Stat(absSourceDir)
		if err != nil {
			return fmt.Errorf("source directory not reachable from new location: %s → %s: %w", newSourceDir, absSourceDir, err)
		}
		if !sdInfo.IsDir() {
			return fmt.Errorf("source_dir is not a directory from new location: %s → %s", newSourceDir, absSourceDir)
		}
		// Update values in the YAML node tree (preserves all other keys/comments)
		setYAMLNodeValue(root, "dedup_file", newDedupFile)
		setYAMLNodeValue(root, "source_dir", newSourceDir)
		// Recalculate relative paths in virtual_files entries
		if err := recalcVirtualFiles(root, srcDir, dstDir); err != nil {
			return fmt.Errorf("recalculate virtual_files paths: %w", err)
		}
		// Recalculate relative include patterns
		recalcIncludes(root, srcDir, dstDir)
		updatedSidecar, err = yaml.Marshal(&doc)
		if err != nil {
			return fmt.Errorf("marshal updated sidecar: %w", err)
		}
	}
	// Dry run: print what would happen and return (all validation above has
	// already run, so a dry run exercises the same failure modes).
	if dryRun {
		printInfo("Would move:\n")
		printInfo(" %s → %s\n", absSrc, absDst)
		if hasSidecar {
			printInfo(" %s → %s\n", sidecarSrc, sidecarDst)
			printInfo("\nUpdated sidecar would contain:\n")
			printInfo("%s", string(updatedSidecar))
		}
		return nil
	}
	// Ensure destination directory exists
	dstDir := filepath.Dir(absDst)
	if err := os.MkdirAll(dstDir, 0755); err != nil {
		return fmt.Errorf("create destination directory: %w", err)
	}
	// Move the .mkvdup file (supports cross-filesystem moves)
	if err := moveFile(absSrc, absDst); err != nil {
		return fmt.Errorf("move dedup file: %w", err)
	}
	// With --force and no source sidecar, clean up any orphaned destination
	// sidecar now that the dedup move has succeeded.
	if force && !hasSidecar {
		if _, err := os.Stat(sidecarDst); err == nil {
			if err := osRemove(sidecarDst); err != nil {
				printWarn("Warning: could not remove orphaned sidecar %s: %v\n", sidecarDst, err)
			}
		}
	}
	// Write updated sidecar atomically, then remove old one.
	// If sidecar write fails, rollback the dedup move.
	if hasSidecar {
		if err := writeFileAtomic(sidecarDst, updatedSidecar, 0644); err != nil {
			if rbErr := moveFile(absDst, absSrc); rbErr != nil {
				printWarn("Warning: failed to rollback dedup move: %v\n", rbErr)
			}
			return fmt.Errorf("write sidecar: %w", err)
		}
		if sidecarSrc != sidecarDst {
			if err := osRemove(sidecarSrc); err != nil && !os.IsNotExist(err) {
				printWarn("Warning: could not remove old sidecar %s: %v\n", sidecarSrc, err)
			}
		}
	}
	printInfo("Moved:\n")
	printInfo(" %s → %s\n", absSrc, absDst)
	if hasSidecar {
		printInfo(" %s → %s\n", sidecarSrc, sidecarDst)
	}
	return nil
}
// recalcRelativePath takes a path (which may be relative to oldBase or absolute),
// resolves it to absolute, and returns it relative to newBase. If the original
// path was absolute, it is returned unchanged.
func recalcRelativePath(oldBase, newBase, path string) (string, error) {
	// Absolute paths resolve identically from anywhere — leave them alone.
	if filepath.IsAbs(path) {
		return path, nil
	}
	// Anchor at the old base, then re-express from the new base.
	abs := filepath.Clean(filepath.Join(oldBase, path))
	rel, err := filepath.Rel(newBase, abs)
	if err != nil {
		return "", fmt.Errorf("make relative to %s: %w", newBase, err)
	}
	return rel, nil
}
// resolveRelPath resolves a path relative to baseDir. If already absolute, returns as-is.
func resolveRelPath(baseDir, path string) string {
	if !filepath.IsAbs(path) {
		path = filepath.Clean(filepath.Join(baseDir, path))
	}
	return path
}
// writeFileAtomic writes data to dst via a temp file + rename, ensuring
// no partially written file is left at dst on failure. The temp file is
// cleaned up automatically on any error.
func writeFileAtomic(dst string, data []byte, perm os.FileMode) error {
	tmp, err := os.CreateTemp(filepath.Dir(dst), ".mkvdup-relocate-*.tmp")
	if err != nil {
		return err
	}
	name := tmp.Name()
	committed := false
	defer func() {
		// Remove the temp file on every path except a successful rename.
		if !committed {
			_ = osRemove(name)
		}
	}()
	if _, err := tmp.Write(data); err != nil {
		tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	// CreateTemp uses restrictive permissions; apply the requested mode
	// before the file becomes visible at dst.
	if err := os.Chmod(name, perm); err != nil {
		return err
	}
	if err := osRename(name, dst); err != nil {
		return err
	}
	committed = true
	return nil
}
// moveFile moves a file from src to dst. It tries os.Rename first for
// efficiency; if that fails with EXDEV (cross-device), it falls back to
// copy + remove.
func moveFile(src, dst string) error {
	switch err := osRename(src, dst); {
	case err == nil:
		return nil
	case !errors.Is(err, syscall.EXDEV):
		// Any rename failure other than cross-device is fatal.
		return err
	}
	// Cross-filesystem: copy then remove source.
	if err := copyFile(src, dst); err != nil {
		return fmt.Errorf("cross-device copy: %w", err)
	}
	if err := osRemove(src); err != nil {
		return fmt.Errorf("remove source after cross-device copy: %w", err)
	}
	return nil
}
// copyFile copies a file from src to dst, preserving permissions.
// A partially written dst is removed if the copy or final close fails.
func copyFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()
	info, err := in.Stat()
	if err != nil {
		return err
	}
	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, info.Mode())
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		out.Close()
		_ = osRemove(dst)
		return err
	}
	// Close errors matter here: buffered data may only hit disk on Close.
	if err := out.Close(); err != nil {
		_ = osRemove(dst)
		return err
	}
	return nil
}
// yamlNodeValue returns the string value for a key in a YAML mapping node.
// Returns "" if the key is not found.
func yamlNodeValue(mapping *yaml.Node, key string) string {
	if node := yamlNodeByKey(mapping, key); node != nil {
		return node.Value
	}
	return ""
}
// setYAMLNodeValue sets the string value for a key in a YAML mapping node.
// Does nothing if the key is not present.
func setYAMLNodeValue(mapping *yaml.Node, key, value string) {
	if node := yamlNodeByKey(mapping, key); node != nil {
		node.Value = value
	}
}
// recalcVirtualFiles recalculates relative dedup_file and source_dir paths
// in virtual_files entries (a YAML sequence of mappings). Both keys point at
// static locations that are not being moved, so their relative form must be
// recomputed from the sidecar's new directory.
func recalcVirtualFiles(root *yaml.Node, srcDir, dstDir string) error {
	seq := yamlNodeByKey(root, "virtual_files")
	if seq == nil || seq.Kind != yaml.SequenceNode {
		return nil
	}
	for i, entry := range seq.Content {
		if entry.Kind != yaml.MappingNode {
			continue
		}
		// Both keys get the identical treatment; loop instead of two
		// copy-pasted blocks.
		for _, key := range []string{"dedup_file", "source_dir"} {
			old := yamlNodeValue(entry, key)
			if old == "" {
				continue
			}
			recalced, err := recalcRelativePath(srcDir, dstDir, old)
			if err != nil {
				return fmt.Errorf("virtual_files[%d].%s: %w", i, key, err)
			}
			setYAMLNodeValue(entry, key, recalced)
		}
	}
	return nil
}
// recalcIncludes recalculates relative include glob patterns in the sidecar.
// Absolute patterns and recalculation failures are left untouched.
func recalcIncludes(root *yaml.Node, srcDir, dstDir string) {
	seq := yamlNodeByKey(root, "includes")
	if seq == nil || seq.Kind != yaml.SequenceNode {
		return
	}
	for _, entry := range seq.Content {
		if entry.Kind != yaml.ScalarNode || filepath.IsAbs(entry.Value) {
			continue
		}
		// Glob patterns may contain wildcards, but only the directory prefix
		// needs adjusting; filepath.Rel handles non-existent paths fine.
		if recalced, err := recalcRelativePath(srcDir, dstDir, entry.Value); err == nil {
			entry.Value = recalced
		}
	}
}
// yamlNodeByKey returns the value node for a key in a YAML mapping node.
// Returns nil if the key is not found.
func yamlNodeByKey(mapping *yaml.Node, key string) *yaml.Node {
	// Mapping content alternates key node, value node, key node, value node...
	content := mapping.Content
	for i := 0; i+1 < len(content); i += 2 {
		if content[i].Value == key {
			return content[i+1]
		}
	}
	return nil
}
package main
import (
"fmt"
"os"
"github.com/stuckj/mkvdup/internal/dedup"
)
// fileStats holds statistics for a single dedup file.
type fileStats struct {
	name        string // virtual file name from the config
	dedupFile   string // path to the .mkvdup file
	sourceDir   string // configured source directory
	origSize    int64  // original (pre-dedup) size in bytes
	dedupSize   int64  // on-disk size of the dedup file in bytes
	sourceType  string // "DVD", "Blu-ray", or "Unknown"
	sourceFiles int    // number of source files referenced by the dedup file
	entryCount  int    // number of index entries in the dedup file
	err         error  // non-nil if stats collection failed for this file
}
// showStats displays space savings and file statistics for mkvdup-managed files.
// Per-config and per-file failures are warned about and skipped rather than
// aborting the whole run.
func showStats(configPaths []string, configDir bool) error {
	resolved, err := resolveConfigPaths(configPaths, configDir)
	if err != nil {
		return err
	}
	// Resolve each config independently so a single bad config doesn't
	// abort the entire stats run.
	var configs []dedup.Config
	for _, cfgPath := range resolved {
		loaded, _, _, loadErr := dedup.ResolveConfigs([]string{cfgPath})
		if loadErr != nil {
			printWarn("Failed to load config %s: %v\n", cfgPath, loadErr)
			continue
		}
		configs = append(configs, loaded...)
	}
	if len(configs) == 0 {
		printInfoln("No files found.")
		return nil
	}
	stats := make([]fileStats, 0, len(configs))
	for _, cfg := range configs {
		fs := collectFileStats(cfg)
		stats = append(stats, fs)
		if fs.err != nil {
			printWarn("%s\n Error: %v\n\n", fs.name, fs.err)
			continue
		}
		printFileStats(fs)
	}
	printRollupStats(stats)
	return nil
}
// collectFileStats gathers statistics for a single dedup file from its config.
//
// The reader's Info() map is produced elsewhere, so every type assertion on
// it is checked: previously a bare .(int64)/.(int)/.(uint8) here would panic
// on an unexpected value and take down the whole stats run; now a malformed
// map is reported through fs.err like any other per-file failure.
func collectFileStats(cfg dedup.Config) fileStats {
	fs := fileStats{
		name:      cfg.Name,
		dedupFile: cfg.DedupFile,
		sourceDir: cfg.SourceDir,
	}
	reader, err := dedup.NewReaderLazy(cfg.DedupFile, cfg.SourceDir)
	if err != nil {
		fs.err = fmt.Errorf("open dedup file: %w", err)
		return fs
	}
	defer reader.Close()
	info := reader.Info()
	if errMsg, ok := info["error"]; ok {
		fs.err = fmt.Errorf("read dedup file: %v", errMsg)
		return fs
	}
	var okSize, okFiles, okEntries, okType bool
	fs.origSize, okSize = info["original_size"].(int64)
	fs.sourceFiles, okFiles = info["source_file_count"].(int)
	fs.entryCount, okEntries = info["entry_count"].(int)
	sourceType, okType := info["source_type"].(uint8)
	if !okSize || !okFiles || !okEntries || !okType {
		fs.err = fmt.Errorf("read dedup file: unexpected info format")
		return fs
	}
	switch sourceType {
	case 0:
		fs.sourceType = "DVD"
	case 1:
		fs.sourceType = "Blu-ray"
	default:
		fs.sourceType = "Unknown"
	}
	dedupInfo, err := os.Stat(cfg.DedupFile)
	if err != nil {
		fs.err = fmt.Errorf("stat dedup file: %w", err)
		return fs
	}
	fs.dedupSize = dedupInfo.Size()
	return fs
}
// printFileStats prints per-file statistics.
func printFileStats(fs fileStats) {
	saved := fs.origSize - fs.dedupSize
	// Guard against division by zero for an empty original.
	var pct float64
	if fs.origSize > 0 {
		pct = float64(saved) / float64(fs.origSize) * 100
	}
	printInfo("%s\n", fs.name)
	printInfo(" Original size: %s bytes (%s)\n", formatInt(fs.origSize), formatSize(fs.origSize))
	printInfo(" Dedup file size: %s bytes (%s)\n", formatInt(fs.dedupSize), formatSize(fs.dedupSize))
	printInfo(" Space savings: %s bytes (%.2f%%)\n", formatInt(saved), pct)
	printInfo(" Source type: %s\n", fs.sourceType)
	printInfo(" Source directory: %s\n", fs.sourceDir)
	printInfo(" Source files: %d\n", fs.sourceFiles)
	printInfo(" Index entries: %s\n", formatInt(int64(fs.entryCount)))
	printInfoln()
}
// printRollupStats prints aggregate statistics across all successful files.
// Nothing is printed unless at least two files succeeded, since a rollup of
// one file would just repeat its per-file output.
func printRollupStats(stats []fileStats) {
	var origTotal, dedupTotal int64
	succeeded := 0
	sources := make(map[string]struct{})
	for _, fs := range stats {
		if fs.err != nil {
			continue
		}
		succeeded++
		origTotal += fs.origSize
		dedupTotal += fs.dedupSize
		sources[fs.sourceDir] = struct{}{}
	}
	if succeeded < 2 {
		return
	}
	var pct float64
	if origTotal > 0 {
		pct = float64(origTotal-dedupTotal) / float64(origTotal) * 100
	}
	printInfo("Totals (%d %s):\n", succeeded, plural(succeeded, "file", "files"))
	printInfo(" Original size: %s bytes (%s)\n", formatInt(origTotal), formatSize(origTotal))
	printInfo(" Dedup file size: %s bytes (%s)\n", formatInt(dedupTotal), formatSize(dedupTotal))
	printInfo(" Space savings: %s bytes (%.2f%%)\n", formatInt(origTotal-dedupTotal), pct)
	printInfo(" Unique sources: %d\n", len(sources))
}
package main
import (
"fmt"
"os"
"path"
"path/filepath"
"slices"
"strings"
"github.com/stuckj/mkvdup/internal/dedup"
)
// validationEntry tracks the result of validating a single resolved config entry.
type validationEntry struct {
	name       string // virtual file name
	status     string // "OK", "WARN", "ERR"
	message    string // detail message (empty for OK)
	configFile string // which input config file this came from
	dedupFile  string // resolved dedup file path
}
// validateConfigEntries resolves and validates each config file: YAML parsing,
// path existence checks, and dedup file header validation. Returns the
// validation entries, the successfully-parsed configs, and whether any errors
// were found.
func validateConfigEntries(configPaths []string) ([]validationEntry, []dedup.Config, bool) {
	var allEntries []validationEntry
	var allConfigs []dedup.Config
	hasErrors := false
	// failEntry records and prints a per-entry validation failure. The five
	// checks below previously repeated this five-line block verbatim.
	failEntry := func(entry validationEntry, format string, args ...any) {
		entry.status = "ERR"
		entry.message = fmt.Sprintf(format, args...)
		fmt.Printf(" ERR %s: %s\n", entry.name, entry.message)
		allEntries = append(allEntries, entry)
		hasErrors = true
	}
	for _, configPath := range configPaths {
		fmt.Printf("Validating %s...\n", filepath.Base(configPath))
		configs, _, _, err := dedup.ResolveConfigs([]string{configPath})
		if err != nil {
			// Whole-config failure: record one entry for the file itself.
			fmt.Printf(" ERR %s\n", err)
			allEntries = append(allEntries, validationEntry{
				name:       filepath.Base(configPath),
				status:     "ERR",
				message:    err.Error(),
				configFile: configPath,
			})
			hasErrors = true
			continue
		}
		if len(configs) == 0 {
			fmt.Printf(" (no entries)\n")
			continue
		}
		for _, cfg := range configs {
			entry := validationEntry{
				name:       cfg.Name,
				status:     "OK",
				configFile: configPath,
				dedupFile:  cfg.DedupFile,
			}
			// Check dedup file exists and is a regular file
			dedupStat, err := os.Stat(cfg.DedupFile)
			if err != nil {
				failEntry(entry, "dedup file: %v", err)
				continue
			}
			if dedupStat.IsDir() {
				failEntry(entry, "dedup file is a directory: %s", cfg.DedupFile)
				continue
			}
			// Check source dir exists and is a directory
			sourceStat, err := os.Stat(cfg.SourceDir)
			if err != nil {
				failEntry(entry, "source directory: %v", err)
				continue
			}
			if !sourceStat.IsDir() {
				failEntry(entry, "source path is not a directory: %s", cfg.SourceDir)
				continue
			}
			// Validate dedup file header
			reader, err := dedup.NewReaderLazy(cfg.DedupFile, cfg.SourceDir)
			if err != nil {
				failEntry(entry, "invalid dedup file: %v", err)
				continue
			}
			reader.Close()
			allEntries = append(allEntries, entry)
			allConfigs = append(allConfigs, cfg)
		}
	}
	return allEntries, allConfigs, hasErrors
}
// checkNameConflicts validates virtual file paths and detects duplicate names
// and file/directory conflicts across all entries. Updates entry statuses
// in-place and returns whether any errors or warnings were found.
//
// First occurrence of a name wins: later duplicates get a WARN pointing back
// at the config file that registered the name first.
func checkNameConflicts(entries []validationEntry) (hasErrors, hasWarnings bool) {
	nameToConfig := make(map[string]string)   // clean path -> config file
	dirComponents := make(map[string]string)  // paths used as directories -> config file
	fileComponents := make(map[string]string) // paths used as files -> config file
	for i, entry := range entries {
		// Entries that already failed basic validation are skipped entirely.
		if entry.status == "ERR" {
			continue
		}
		name := entry.name
		// Check for ".." path components
		if slices.Contains(strings.Split(name, "/"), "..") {
			entries[i].status = "ERR"
			entries[i].message = "invalid path: contains '..' component"
			fmt.Printf(" ERR %s: %s\n", name, entries[i].message)
			hasErrors = true
			continue
		}
		// Clean and validate the path (same logic as tree.go insertFile)
		cleanPath := cleanVirtualPath(name)
		if cleanPath == "" {
			entries[i].status = "ERR"
			entries[i].message = "invalid path: empty after cleaning"
			fmt.Printf(" ERR %s: %s\n", name, entries[i].message)
			hasErrors = true
			continue
		}
		// Check for duplicate names
		if prevConfig, exists := nameToConfig[cleanPath]; exists {
			entries[i].status = "WARN"
			entries[i].message = fmt.Sprintf("duplicate name (also in %s)", filepath.Base(prevConfig))
			fmt.Printf(" WARN %s: %s\n", name, entries[i].message)
			hasWarnings = true
			continue
		}
		nameToConfig[cleanPath] = entry.configFile
		// Check for file/directory conflicts
		parts := strings.Split(cleanPath, "/")
		conflictFound := false
		// Check if any prefix of this path is used as a file.
		// On the first conflict we break without recording the remaining
		// prefixes as directories.
		for j := 0; j < len(parts)-1; j++ {
			dirPath := strings.Join(parts[:j+1], "/")
			if prevConfig, exists := fileComponents[dirPath]; exists {
				entries[i].status = "WARN"
				entries[i].message = fmt.Sprintf("path component %q conflicts with file in %s", dirPath, filepath.Base(prevConfig))
				fmt.Printf(" WARN %s: %s\n", name, entries[i].message)
				hasWarnings = true
				conflictFound = true
				break
			}
			// Record as directory component (first-seen config wins)
			if _, exists := dirComponents[dirPath]; !exists {
				dirComponents[dirPath] = entry.configFile
			}
		}
		if conflictFound {
			continue
		}
		// Check if this file name conflicts with a directory
		if prevConfig, exists := dirComponents[cleanPath]; exists {
			entries[i].status = "WARN"
			entries[i].message = fmt.Sprintf("conflicts with directory from %s", filepath.Base(prevConfig))
			fmt.Printf(" WARN %s: %s\n", name, entries[i].message)
			hasWarnings = true
			continue
		}
		fileComponents[cleanPath] = entry.configFile
		// Print OK for entries that passed all checks
		if entries[i].status == "OK" {
			fmt.Printf(" OK %s\n", name)
		}
	}
	return hasErrors, hasWarnings
}
// runDeepValidation performs integrity verification on dedup files that passed
// basic validation. Returns whether any errors were found.
func runDeepValidation(entries []validationEntry, configs []dedup.Config) bool {
	fmt.Println()
	fmt.Println("Running deep validation...")
	// passedBasic reports whether cfg has a matching entry that did not
	// fail basic validation.
	passedBasic := func(cfg dedup.Config) bool {
		for _, e := range entries {
			if e.name == cfg.Name && e.dedupFile == cfg.DedupFile && e.status != "ERR" {
				return true
			}
		}
		return false
	}
	hasErrors := false
	for _, cfg := range configs {
		if !passedBasic(cfg) {
			continue
		}
		reader, err := dedup.NewReader(cfg.DedupFile, cfg.SourceDir)
		if err != nil {
			fmt.Printf(" ERR %s: failed to open: %v\n", cfg.Name, err)
			hasErrors = true
			continue
		}
		if err := reader.VerifyIntegrity(); err != nil {
			fmt.Printf(" ERR %s: integrity check failed: %v\n", cfg.Name, err)
			reader.Close()
			hasErrors = true
			continue
		}
		reader.Close()
		fmt.Printf(" OK %s: checksums valid\n", cfg.Name)
	}
	return hasErrors
}
// validateConfigs validates configuration files and returns an exit code.
// Returns 0 if all configs are valid (warnings OK without strict), 1 otherwise.
func validateConfigs(configPaths []string, configDir, deep, strict bool) int {
resolved, err := resolveConfigPaths(configPaths, configDir)
if err != nil {
printWarn("Error: %v\n", err)
return 1
}
allEntries, allConfigs, hasErrors := validateConfigEntries(resolved)
nameErrors, hasWarnings := checkNameConflicts(allEntries)
hasErrors = hasErrors || nameErrors
if deep {
hasErrors = hasErrors || runDeepValidation(allEntries, allConfigs)
}
// Print summary
var okCount, warnCount, errCount int
for _, e := range allEntries {
switch e.status {
case "OK":
okCount++
case "WARN":
warnCount++
case "ERR":
errCount++
}
}
fmt.Println()
fmt.Printf("Summary: %d %s, %d valid, %d %s, %d %s\n",
len(allEntries), plural(len(allEntries), "entry", "entries"),
okCount,
warnCount, plural(warnCount, "warning", "warnings"),
errCount, plural(errCount, "error", "errors"))
if hasErrors {
return 1
}
if strict && hasWarnings {
return 1
}
return 0
}
// cleanVirtualPath normalizes a virtual file path, matching the logic in
// internal/fuse/tree.go insertFile(). Returns empty string if the path is invalid.
func cleanVirtualPath(name string) string {
// Clean the path using path.Clean (not filepath.Clean) to match
// internal/fuse/tree.go insertFile() which uses forward-slash paths.
cleaned := path.Clean(name)
// Split and filter
parts := strings.Split(cleaned, "/")
var valid []string
for _, p := range parts {
if p != "" && p != "." {
valid = append(valid, p)
}
}
if len(valid) == 0 {
return ""
}
return strings.Join(valid, "/")
}
package main
import (
"bytes"
"fmt"
"io"
"os"
"path/filepath"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/source"
)
// verifyReconstructionFunc is the function used for post-create verification.
// It is a package-level seam so tests can override it to simulate
// verification failures; production code always points it at verifyReconstruction.
var verifyReconstructionFunc = verifyReconstruction
// verifyReconstruction verifies that the dedup file can reconstruct the original MKV.
// The reconstructed stream is compared against originalPath chunk by chunk; the
// first size or byte mismatch is reported with its offset. After the original
// reaches EOF, the total byte count is checked against the reconstruction size
// so a reconstruction longer than the original is also detected.
// If phasePrefix is non-empty, a progress bar is shown.
// The index parameter is not used by this implementation; it is part of the
// signature shared with verifyReconstructionFunc overrides.
func verifyReconstruction(dedupPath, sourceDir, originalPath string, index *source.Index, phasePrefix string) error {
	reader, err := dedup.NewReader(dedupPath, sourceDir)
	if err != nil {
		return fmt.Errorf("open dedup file: %w", err)
	}
	defer reader.Close()
	if err := reader.LoadSourceFiles(); err != nil {
		return fmt.Errorf("load source files: %w", err)
	}
	// Open original MKV
	original, err := os.Open(originalPath)
	if err != nil {
		return fmt.Errorf("open original: %w", err)
	}
	defer original.Close()
	// Debug: show first few bytes comparison (controlled by verboseWriter; may be enabled via -v/--verbose or --log-verbose + --log-file)
	if vw := verboseWriter(); vw != nil {
		origFirst := make([]byte, 32)
		reconFirst := make([]byte, 32)
		n, _ := original.ReadAt(origFirst, 0)
		fmt.Fprintf(vw, " Debug: Original ReadAt(32, 0) returned %d bytes\n", n)
		n, _ = reader.ReadAt(reconFirst, 0)
		fmt.Fprintf(vw, " Debug: Reader ReadAt(32, 0) returned %d bytes\n", n)
		fmt.Fprintf(vw, " Debug: Original first 32 bytes: %x\n", origFirst)
		fmt.Fprintf(vw, " Debug: Reconstructed first 32 bytes: %x\n", reconFirst)
		// ReadAt does not move the file position, but reset defensively so the
		// comparison loop below always starts at byte 0.
		original.Seek(0, io.SeekStart)
	}
	totalSize := reader.OriginalSize()
	var bar *progressBar
	if phasePrefix != "" {
		bar = newProgressBar(phasePrefix, totalSize, "bytes")
		defer bar.Cancel() // clean up if we return early on error
	}
	// Compare chunk by chunk
	const chunkSize = 1024 * 1024 // 1MB
	originalBuf := make([]byte, chunkSize)
	reconstructedBuf := make([]byte, chunkSize)
	var offset int64
	for {
		n1, err1 := original.Read(originalBuf)
		if n1 == 0 && err1 == io.EOF {
			break
		}
		// ReadAt either fills reconstructedBuf[:n1] or returns a non-nil error,
		// so n1 != n2 below reliably signals a length mismatch.
		n2, err2 := reader.ReadAt(reconstructedBuf[:n1], offset)
		if vw := verboseWriter(); vw != nil && offset == 0 {
			fmt.Fprintf(vw, " Debug: Loop first read - n1=%d, n2=%d, err1=%v, err2=%v\n", n1, n2, err1, err2)
			fmt.Fprintf(vw, " Debug: originalBuf first 32: %x\n", originalBuf[:32])
			fmt.Fprintf(vw, " Debug: reconstructedBuf first 32: %x\n", reconstructedBuf[:32])
		}
		if n1 != n2 {
			return fmt.Errorf("size mismatch at offset %d: original=%d, reconstructed=%d", offset, n1, n2)
		}
		if !bytes.Equal(originalBuf[:n1], reconstructedBuf[:n2]) {
			// Find first mismatch
			for i := 0; i < n1; i++ {
				if originalBuf[i] != reconstructedBuf[i] {
					return fmt.Errorf("data mismatch at offset %d (orig: %02x, recon: %02x)",
						offset+int64(i), originalBuf[i], reconstructedBuf[i])
				}
			}
		}
		offset += int64(n1)
		if bar != nil {
			bar.Update(offset)
		}
		if err1 != nil && err1 != io.EOF {
			return fmt.Errorf("read original at %d: %w", offset, err1)
		}
		if err2 != nil && err2 != io.EOF {
			return fmt.Errorf("read reconstructed at %d: %w", offset, err2)
		}
	}
	// The loop is driven by the original file's EOF. If the reconstruction is
	// larger than the original, the bytes past the original's end were never
	// compared; catch that case explicitly instead of passing silently.
	if offset != totalSize {
		return fmt.Errorf("size mismatch: original is %d bytes, reconstruction is %d bytes", offset, totalSize)
	}
	if bar != nil {
		bar.Finish()
	}
	return nil
}
// openDedupReader opens a dedup file with its source directory, verifies
// integrity, loads source files, and checks source file sizes. This is the
// shared preamble for verify, extract, and similar commands.
// On any failure the reader is closed before the error is returned; on
// success the caller owns the reader and must Close it.
func openDedupReader(dedupPath, sourceDir string) (*dedup.Reader, error) {
	r, err := dedup.NewReader(dedupPath, sourceDir)
	if err != nil {
		return nil, fmt.Errorf("open dedup file: %w", err)
	}
	printInfo("Verifying dedup file checksums...")
	if err := r.VerifyIntegrity(); err != nil {
		printInfoln(" FAILED")
		r.Close()
		return nil, fmt.Errorf("integrity check: %w", err)
	}
	printInfoln(" OK")
	if err := r.LoadSourceFiles(); err != nil {
		r.Close()
		return nil, fmt.Errorf("load source files: %w", err)
	}
	printInfo("Verifying source files...")
	for _, sf := range r.SourceFiles() {
		// Cheap existence/size check only; full checksum verification is a
		// separate, slower operation.
		fullPath := filepath.Join(sourceDir, sf.RelativePath)
		info, statErr := os.Stat(fullPath)
		if statErr != nil {
			printInfoln(" FAILED")
			r.Close()
			return nil, fmt.Errorf("source file %s: %w", sf.RelativePath, statErr)
		}
		if info.Size() != sf.Size {
			printInfoln(" FAILED")
			r.Close()
			return nil, fmt.Errorf("source file %s size mismatch: expected %d, got %d",
				sf.RelativePath, sf.Size, info.Size())
		}
	}
	printInfoln(" OK")
	return r, nil
}
// verifyDedup verifies a dedup file against the original MKV.
// The original's size is compared with the reconstruction size first, then the
// two streams are compared chunk by chunk. Returns nil when verification
// passes; otherwise a descriptive error with the mismatch offset.
func verifyDedup(dedupPath, sourceDir, originalPath string) error {
	printInfo("Verifying dedup file: %s\n", dedupPath)
	printInfo("Source directory: %s\n", sourceDir)
	printInfo("Original MKV: %s\n", originalPath)
	printInfoln()
	reader, err := openDedupReader(dedupPath, sourceDir)
	if err != nil {
		return err
	}
	defer reader.Close()
	// Verify reconstruction matches original
	original, err := os.Open(originalPath)
	if err != nil {
		return fmt.Errorf("open original: %w", err)
	}
	defer original.Close()
	totalSize := reader.OriginalSize()
	// Fail fast on a size difference. This also catches an original with
	// trailing bytes beyond totalSize, which the loop below (driven by
	// totalSize) would otherwise never read or compare.
	if fi, statErr := original.Stat(); statErr == nil && fi.Size() != totalSize {
		return fmt.Errorf("size mismatch: original is %d bytes, reconstruction is %d bytes", fi.Size(), totalSize)
	}
	bar := newProgressBar("Verifying reconstruction...", totalSize, "bytes")
	defer bar.Cancel() // clean up if we return early on error
	const chunkSize = 4 * 1024 * 1024
	originalBuf := make([]byte, chunkSize)
	reconstructedBuf := make([]byte, chunkSize)
	var offset int64
	for offset < totalSize {
		remaining := totalSize - offset
		readSize := int64(chunkSize)
		if readSize > remaining {
			readSize = remaining
		}
		// io.ReadFull avoids spurious "size mismatch" errors: a plain Read is
		// allowed to return fewer bytes than requested even mid-file, while
		// ReadAt always fills the buffer or returns an error.
		n1, err1 := io.ReadFull(original, originalBuf[:readSize])
		n2, err2 := reader.ReadAt(reconstructedBuf[:readSize], offset)
		if n1 != n2 {
			return fmt.Errorf("size mismatch at offset %d", offset)
		}
		if !bytes.Equal(originalBuf[:n1], reconstructedBuf[:n2]) {
			// Locate the first differing byte for the error message.
			for i := 0; i < n1; i++ {
				if originalBuf[i] != reconstructedBuf[i] {
					return fmt.Errorf("data mismatch at offset %d", offset+int64(i))
				}
			}
		}
		// ErrUnexpectedEOF (short final read) is handled via the n1/n2
		// comparison above, not treated as an I/O failure.
		if err1 != nil && err1 != io.EOF && err1 != io.ErrUnexpectedEOF {
			return fmt.Errorf("read original: %w", err1)
		}
		if err2 != nil && err2 != io.EOF {
			return fmt.Errorf("read reconstructed: %w", err2)
		}
		offset += int64(n1)
		bar.Update(offset)
	}
	bar.Finish()
	printInfoln()
	printInfoln("Verification PASSED")
	return nil
}
// extractDedup rebuilds the original MKV from a dedup file and source.
// On any failure the partially written output file is removed; on success
// the fully reconstructed MKV is left at outputPath.
func extractDedup(dedupPath, sourceDir, outputPath string) (retErr error) {
	printInfo("Dedup file: %s\n", dedupPath)
	printInfo("Source directory: %s\n", sourceDir)
	printInfo("Output MKV: %s\n", outputPath)
	printInfoln()
	rdr, err := openDedupReader(dedupPath, sourceDir)
	if err != nil {
		return err
	}
	defer rdr.Close()
	outFile, err := os.Create(outputPath)
	if err != nil {
		return fmt.Errorf("create output file: %w", err)
	}
	defer func() {
		// The success path closes outFile explicitly below; this cleanup only
		// runs when an error is being returned, closing the handle and
		// removing the partial output.
		if retErr != nil {
			outFile.Close()
			os.Remove(outputPath)
		}
	}()
	totalSize := rdr.OriginalSize()
	bar := newProgressBar("Extracting...", totalSize, "bytes")
	defer bar.Cancel() // clean up if we return early on error
	const chunkSize = 4 * 1024 * 1024
	chunk := make([]byte, chunkSize)
	for written := int64(0); written < totalSize; {
		want := totalSize - written
		if want > chunkSize {
			want = chunkSize
		}
		n, readErr := rdr.ReadAt(chunk[:want], written)
		if readErr != nil && readErr != io.EOF {
			return fmt.Errorf("read at offset %d: %w", written, readErr)
		}
		if n == 0 {
			// Reader produced nothing before we reached totalSize.
			return fmt.Errorf("unexpected EOF at offset %d (expected %d bytes)", written, totalSize)
		}
		if _, writeErr := outFile.Write(chunk[:n]); writeErr != nil {
			return fmt.Errorf("write at offset %d: %w", written, writeErr)
		}
		written += int64(n)
		bar.Update(written)
	}
	bar.Finish()
	if err := outFile.Close(); err != nil {
		return fmt.Errorf("close output: %w", err)
	}
	printInfo("\nExtracted %s bytes to %s\n", formatInt(totalSize), outputPath)
	return nil
}
package main
import (
"fmt"
"os"
)
// printUsage prints the top-level help text to stdout, including the
// build-dependent debug options section from debugOptionsHelp().
func printUsage() {
	fmt.Print(`mkvdup - MKV deduplication tool using FUSE
Usage: mkvdup [options] <command> [args...]
Commands:
create Create dedup file from MKV + source directory
batch-create Create multiple dedup files from one source
probe Quick test if MKV matches source(s)
mount Mount dedup files as FUSE filesystem
info Show dedup file information
verify Verify dedup file against original MKV
extract Rebuild original MKV from dedup + source
check Check dedup + source file integrity
stats Show space savings and file statistics
validate Validate configuration files
reload Reload running daemon's configuration
expand-config Expand wildcard config to explicit file list
relocate Move dedup file + sidecar, updating paths
Analysis commands:
deltadiag Analyze unmatched regions by stream type
Debug commands:
parse-mkv Parse MKV and show packet info
index-source Index source directory
match Match MKV packets to source
Options:
-v, --verbose Enable verbose output
-q, --quiet Suppress informational progress output
--no-progress Disable progress bars (still show status messages)
--log-file PATH Duplicate output to a log file (non-TTY style)
--log-verbose Enable verbose output in log file only
-h, --help Show help
--version Show version
`)
	fmt.Print(debugOptionsHelp())
	fmt.Print(`Run 'mkvdup <command> --help' for more information on a command.
See 'man mkvdup' for detailed documentation.
`)
}
// printCommandUsage prints the help text for a specific command, falling
// back to the top-level usage for unknown commands.
func printCommandUsage(cmd string) {
	usageFns := map[string]func(){
		"create":        printCreateUsage,
		"batch-create":  printBatchCreateUsage,
		"probe":         printProbeUsage,
		"mount":         printMountUsage,
		"info":          printInfoUsage,
		"verify":        printVerifyUsage,
		"extract":       printExtractUsage,
		"check":         printCheckUsage,
		"stats":         printStatsUsage,
		"validate":      printValidateUsage,
		"reload":        printReloadUsage,
		"expand-config": printExpandConfigUsage,
		"relocate":      printRelocateUsage,
		"deltadiag":     printDeltadiagUsage,
		"parse-mkv":     printParseMKVUsage,
		"index-source":  printIndexSourceUsage,
		"match":         printMatchUsage,
	}
	if fn, ok := usageFns[cmd]; ok {
		fn()
		return
	}
	printUsage()
}
// printCreateUsage prints the help text for the "create" command.
func printCreateUsage() {
	fmt.Print(`Usage: mkvdup create [options] <mkv-file> <source-dir> <output> [name]
Create a dedup file from an MKV and its source media.
Arguments:
<mkv-file> Path to the MKV file to deduplicate
<source-dir> Directory containing source media (ISO files or BDMV folders)
<output> Output .mkvdup file path
[name] Display name in FUSE mount (default: basename of mkv-file;
.mkv extension auto-added if missing)
Options:
-v, --verbose Enable verbose/debug output
--log-file PATH Duplicate output to a log file (non-TTY style)
--log-verbose Enable verbose output in log file only
--warn-threshold N Minimum space savings percentage to avoid warning (default: 75)
--non-interactive Don't prompt on codec mismatch (show warning and continue)
Before matching, codecs in the MKV are compared against the source media.
If a mismatch is detected (e.g., MKV has H.264 but source is MPEG-2), you
will be prompted to continue. Use --non-interactive for scripted usage.
After writing, the dedup file is verified against the original MKV. If
verification fails, the output is renamed to <output>.failed and the
command exits with code 1.
Examples:
mkvdup create movie.mkv /media/dvd-backups movie.mkvdup
mkvdup create movie.mkv /media/dvd-backups movie.mkvdup "My Movie"
mkvdup create --warn-threshold 50 movie.mkv /media/dvd-backups movie.mkvdup
mkvdup create --non-interactive movie.mkv /media/dvd-backups movie.mkvdup
`)
}
// printBatchCreateUsage prints the help text for the "batch-create" command,
// including the YAML manifest format.
func printBatchCreateUsage() {
	fmt.Print(`Usage: mkvdup batch-create [options] <manifest.yaml>
Create multiple dedup files from a YAML manifest. Files sharing the same
source directory are grouped and the source is indexed once per group.
Codec compatibility is checked for each file. If a mismatch is detected,
a warning is printed but processing continues (non-interactive mode).
Use --skip-codec-mismatch to skip mismatched files instead.
Arguments:
<manifest.yaml> YAML manifest file specifying source(s) and MKV files
Options:
-v, --verbose Enable verbose/debug output
--log-file PATH Duplicate output to a log file (non-TTY style)
--log-verbose Enable verbose output in log file only
--warn-threshold N Minimum space savings percentage to avoid warning (default: 75)
--skip-codec-mismatch Skip MKVs with codec mismatch instead of processing them
Manifest format:
source_dir: /media/dvd-backups/disc1 # default for all files (optional)
files:
- mkv: episode1.mkv
output: episode1.mkvdup
name: "Show/S01/Episode 1" # optional (.mkv auto-added)
- mkv: episode2.mkv
output: episode2.mkvdup
- mkv: movie.mkv
output: movie.mkvdup
source_dir: /media/dvd-backups/disc2 # per-file override
Fields:
source_dir Default source directory (optional if all files specify their own)
files List of MKV files to process (required, at least one)
files[].mkv Path to MKV file (required)
files[].output Output .mkvdup file (required)
files[].source_dir Source directory for this file (overrides top-level default)
files[].name Display name in FUSE mount (default: basename of mkv;
.mkv extension auto-added if missing)
Relative paths are resolved against the manifest file's directory.
Partial failure handling:
If one file fails, processing continues for the remaining files.
If verification fails for a file, the output is renamed to <output>.failed
and shown as FAIL in the summary.
Exit code is 0 if any file succeeded (including cached outputs from
prior runs), or if all files were skipped.
Exit code is 1 only if all processed files failed.
Examples:
mkvdup batch-create episodes.yaml
mkvdup batch-create --warn-threshold 50 episodes.yaml
mkvdup batch-create --skip-codec-mismatch episodes.yaml
`)
}
// printProbeUsage prints the help text for the "probe" command.
func printProbeUsage() {
	fmt.Print(`Usage: mkvdup probe <mkv-file>... -- <source-dir>...
Quick test to check if MKV file(s) match one or more source directories.
When multiple MKVs are provided, each source is indexed only once.
Arguments:
<mkv-file> One or more MKV files to test (before --)
-- Separator between MKV files and source directories
<source-dir> One or more directories to test against (after --)
For backward compatibility, a single MKV without -- is also supported:
mkvdup probe movie.mkv /media/disc1 /media/disc2
Examples:
mkvdup probe movie.mkv /media/disc1 /media/disc2
mkvdup probe ep1.mkv ep2.mkv ep3.mkv -- /media/disc1 /media/disc2
`)
}
// printMountUsage prints the help text for the "mount" command.
// Uses os.Stdout.WriteString rather than fmt.Print to emit the large help
// string directly.
func printMountUsage() {
	os.Stdout.WriteString(`Usage: mkvdup mount [options] <mountpoint> [config.yaml...]
Mount dedup files as a FUSE filesystem.
Arguments:
<mountpoint> Directory to mount the filesystem
[config.yaml] YAML config files (default: /etc/mkvdup.conf)
Options:
--allow-other Allow other users to access the mount
--foreground Run in foreground (for debugging or systemd)
--config-dir Treat config argument as directory of YAML files (.yaml, .yml)
--pid-file PATH Write daemon PID to file
--daemon-timeout DUR Timeout waiting for daemon startup (default: 30s)
Permission Options:
--default-uid UID Default UID for files and directories (default: calling user's UID)
--default-gid GID Default GID for files and directories (default: calling user's GID)
--default-file-mode MODE Default mode for files (octal, default: 0444)
--default-dir-mode MODE Default mode for directories (octal, default: 0555)
--permissions-file PATH Path to permissions file (overrides default locations)
Source Watch Options:
--no-source-watch Disable source file monitoring (enabled by default)
--on-source-change ACTION Action on source change: warn, disable, checksum (default)
warn - log a warning
disable - disable affected virtual files (reads return EIO)
checksum - size change: disable immediately
timestamp-only: verify checksum in background,
disable on mismatch, re-enable on pass
--source-watch-poll-interval DUR Poll interval for source file changes (default: 60s)
--source-read-timeout DUR Read timeout for network FS sources (default: 30s)
Config Watch Options:
--no-config-watch Disable config file monitoring (enabled by default)
--on-config-change ACTION Action on config change: reload (default), warn
reload - automatically reload configuration
warn - log a warning only
Error Notification (configured in YAML config, not CLI):
on_error_command:
command: ["/path/to/script", "%source%", "%event%", "%files%"]
timeout: 30s # command timeout (default: 30s)
batch_interval: 5s # debounce window for batching events (default: 5s)
Placeholders: %source% (path), %files% (affected files), %event% (error type)
String form (sh -c) auto-escapes placeholders; do not add your own quotes.
See docs/FUSE.md for details.
By default, mkvdup daemonizes after the mount is ready and returns.
Use --foreground to keep it attached to the terminal.
Permission files are searched in order:
1. --permissions-file (if specified)
2. ~/.config/mkvdup/permissions.yaml (if exists)
3. /etc/mkvdup/permissions.yaml (if exists)
New permissions are written to ~/.config/mkvdup/permissions.yaml (user) or
/etc/mkvdup/permissions.yaml (root).
Examples:
mkvdup mount /mnt/videos movie.mkvdup.yaml
mkvdup mount /mnt/videos *.yaml
mkvdup mount --allow-other /mnt/videos
mkvdup mount --config-dir /mnt/videos /etc/mkvdup.d/
mkvdup mount --foreground /mnt/videos config.yaml
mkvdup mount --default-uid 1000 --default-gid 1000 /mnt/videos config.yaml
mkvdup mount --source-watch-poll-interval 10s /mnt/videos config.yaml
mkvdup mount --source-read-timeout 1m /mnt/videos config.yaml
`)
}
// printInfoUsage prints the help text for the "info" command.
func printInfoUsage() {
	fmt.Print(`Usage: mkvdup info [options] <dedup-file>
Show information about a dedup file.
Arguments:
<dedup-file> Path to the .mkvdup file
Options:
--hide-unused-files Hide source files not referenced by any index entry
Examples:
mkvdup info movie.mkvdup
mkvdup info --hide-unused-files movie.mkvdup
`)
}
// printVerifyUsage prints the help text for the "verify" command.
func printVerifyUsage() {
	fmt.Print(`Usage: mkvdup verify <dedup-file> <source-dir> <original-mkv>
Verify that a dedup file correctly reconstructs the original MKV.
Arguments:
<dedup-file> Path to the .mkvdup file
<source-dir> Directory containing the source media
<original-mkv> Path to the original MKV for comparison
Examples:
mkvdup verify movie.mkvdup /media/dvd-backups original.mkv
`)
}
// printExtractUsage prints the help text for the "extract" command.
func printExtractUsage() {
	fmt.Print(`Usage: mkvdup extract <dedup-file> <source-dir> <output-mkv>
Rebuild the original MKV from a dedup file and source media.
Arguments:
<dedup-file> Path to the .mkvdup file
<source-dir> Directory containing the source media
<output-mkv> Path for the reconstructed MKV file
Examples:
mkvdup extract movie.mkvdup /media/dvd-backups restored-movie.mkv
`)
}
// printCheckUsage prints the help text for the "check" command.
func printCheckUsage() {
	fmt.Print(`Usage: mkvdup check <dedup-file> <source-dir> [options]
Check integrity of a dedup file and its source files.
Arguments:
<dedup-file> Path to the .mkvdup file
<source-dir> Directory containing the source media
Options:
--source-checksums Verify source file checksums (slow, reads entire files)
Checks performed:
- Dedup file header validity (magic, version, structure)
- Index and delta checksum verification
- Source file existence and size
With --source-checksums:
- Source file checksum verification (reads entire files)
Examples:
mkvdup check movie.mkvdup /media/dvd-backups
mkvdup check --source-checksums movie.mkvdup /media/dvd-backups
`)
}
// printStatsUsage prints the help text for the "stats" command.
func printStatsUsage() {
	fmt.Print(`Usage: mkvdup stats [options] <config.yaml...>
Show space savings and file statistics for mkvdup-managed files.
Arguments:
<config.yaml> YAML config files (same format as mount/validate)
Options:
--config-dir Treat config argument as directory of YAML files (.yaml, .yml)
Output includes per-file statistics (original size, dedup file size, space
savings, source type) and a rollup summary when multiple files are present.
Examples:
mkvdup stats config.yaml
mkvdup stats --config-dir /etc/mkvdup.d/
mkvdup stats movie1.yaml movie2.yaml
`)
}
// printValidateUsage prints the help text for the "validate" command.
func printValidateUsage() {
	fmt.Print(`Usage: mkvdup validate [options] <config.yaml...>
Validate configuration files for correctness before mounting.
Arguments:
<config.yaml> YAML config files to validate
Options:
--config-dir Treat config argument as directory of YAML files (.yaml, .yml)
--deep Verify dedup file headers and internal checksums
--strict Treat warnings as errors (exit 1 on warnings)
Validations performed:
- YAML syntax and required fields (name, dedup_file, source_dir)
- Include cycle detection
- Dedup file existence and header validity
- Source directory existence
- Duplicate virtual file names (warning)
- File/directory path conflicts (warning)
- Invalid path names (empty, contains "..")
With --deep:
- Dedup file internal checksum verification
Exit codes:
0 All configs valid (warnings may be present)
1 Errors found (or warnings with --strict)
Examples:
mkvdup validate config.yaml
mkvdup validate *.yaml
mkvdup validate --config-dir /etc/mkvdup.d/
mkvdup validate --deep --strict /etc/mkvdup.conf
`)
}
// printReloadUsage prints the help text for the "reload" command.
func printReloadUsage() {
	fmt.Print(`Usage: mkvdup reload {--pid-file PATH | --pid PID} [options] [config.yaml...]
Reload a running daemon's configuration by validating the config
and sending SIGHUP to the daemon process.
The config is validated BEFORE sending the signal. If validation
fails, the signal is not sent and the error is reported.
If no config files are specified, the signal is sent without
pre-validation (the daemon validates internally on SIGHUP).
Arguments:
[config.yaml] Config files to validate (same as mount's config args)
Required (one of):
--pid-file PATH PID file of running daemon (must match mount's --pid-file)
--pid PID PID of the running daemon (e.g., for foreground mode)
Options:
--config-dir Treat config argument as directory of YAML files
Examples:
mkvdup reload --pid-file /run/mkvdup.pid config.yaml
mkvdup reload --pid-file /run/mkvdup.pid --config-dir /etc/mkvdup.d/
mkvdup reload --pid-file /run/mkvdup.pid
mkvdup reload --pid $(pidof mkvdup)
`)
}
// printExpandConfigUsage prints the help text for the "expand-config"
// command. Uses os.Stdout.WriteString to emit the large help string directly.
func printExpandConfigUsage() {
	os.Stdout.WriteString(`Usage: mkvdup expand-config [options] <config-file>
Expand a mount config's include globs into explicit file paths.
Reads a standard mount config file (the same format accepted by mount,
validate, and reload), resolves its includes glob patterns to explicit
paths, and writes an expanded config. All other settings (on_error_command,
virtual_files, top-level mappings) are preserved unchanged. The included
files themselves are not modified and can still contain their own globs.
Arguments:
<config-file> Config file to expand (same format as mount)
Options:
--output PATH Write expanded config to PATH (default: stdout)
--dry-run Preview expanded output without writing
Example input (standard mount config):
includes:
- "/data/isos/dvds/**/*.mkvdup.yaml"
on_error_command:
command: ["curl", "-d", "%source%", "https://ntfy.sh/mkvdup"]
Output (globs resolved, all other settings preserved):
# Auto-generated by: mkvdup expand-config
# Source: /path/to/mount-config.yaml
# Generated: 2026-03-24T12:00:00Z
includes:
- "/data/isos/dvds/movie1/movie1.mkvdup.yaml"
- "/data/isos/dvds/movie2/movie2.mkvdup.yaml"
on_error_command:
command: ["curl", "-d", "%source%", "https://ntfy.sh/mkvdup"]
The output is a drop-in replacement for the original config, usable
directly with mount:
mkvdup mount /mnt/videos expanded-config.yaml
If the --output file already exists and the content is unchanged,
the file is not rewritten (avoiding unnecessary reloads).
Recommended workflow:
1. Keep a mount config with include globs as the source of truth
2. Run 'mkvdup expand-config config.yaml --output expanded.yaml'
3. Point the FUSE mount at expanded.yaml
4. When new .mkvdup.yaml files are added, re-run expand-config
5. Reload the mount: mkvdup reload --pid-file /run/mkvdup.pid
Examples:
mkvdup expand-config mount-config.yaml
mkvdup expand-config mount-config.yaml --output expanded.yaml
mkvdup expand-config --dry-run mount-config.yaml
`)
}
// printRelocateUsage prints the help text for the "relocate" command.
func printRelocateUsage() {
	fmt.Print(`Usage: mkvdup relocate [options] <source.mkvdup> <destination>
Move an .mkvdup file and its .mkvdup.yaml sidecar to a new location,
updating relative paths in the sidecar so they resolve to the same
absolute locations from the new position.
Arguments:
<source.mkvdup> Path to the .mkvdup file to move
<destination> Destination path (file or directory)
Options:
--dry-run Preview changes without moving files
--force Overwrite destination if it already exists
If <destination> is an existing directory, the file is moved into that
directory with its original filename. Otherwise, <destination> is used
as the new file path.
The .mkvdup.yaml sidecar (if present) is moved alongside the .mkvdup
file. The dedup_file path is updated to reference the new .mkvdup
location. The source_dir path is recalculated so it resolves to the
same absolute location from the new position (absolute source_dir
paths are preserved unchanged).
Before moving, the command validates that source directories referenced
by the sidecar would remain reachable from the new location. If not,
the move is refused.
Examples:
mkvdup relocate movie.mkvdup /new/location/movie.mkvdup
mkvdup relocate movie.mkvdup /new/location/
mkvdup relocate --dry-run movie.mkvdup /new/location/
mkvdup relocate --force movie.mkvdup /new/location/movie.mkvdup
`)
}
// printDeltadiagUsage prints the help text for the "deltadiag" analysis
// command.
func printDeltadiagUsage() {
	fmt.Print(`Usage: mkvdup deltadiag <dedup-file> <mkv-file>
Analyze unmatched (delta) regions in a dedup file by cross-referencing
with the original MKV to determine what stream type each delta region
belongs to (video, audio, or container overhead).
For video delta, further classifies by H.264 NAL type (IDR/non-IDR slices,
SEI, SPS, PPS, etc.) and shows size breakdown.
Works with dedup file versions 3 through 8 (DVD, Blu-ray, and newer).
Arguments:
<dedup-file> Path to the .mkvdup file
<mkv-file> Path to the original MKV file
Examples:
mkvdup deltadiag movie.mkvdup movie.mkv
`)
}
// printParseMKVUsage prints the help text for the "parse-mkv" debug command.
func printParseMKVUsage() {
	fmt.Print(`Usage: mkvdup parse-mkv <mkv-file>
Parse an MKV file and display packet information (debugging).
Arguments:
<mkv-file> Path to the MKV file to parse
Examples:
mkvdup parse-mkv movie.mkv
`)
}
// printIndexSourceUsage prints the help text for the "index-source" debug
// command.
func printIndexSourceUsage() {
	fmt.Print(`Usage: mkvdup index-source <source-dir>
Index a source directory and display statistics (debugging).
Arguments:
<source-dir> Directory containing source media (ISO files or BDMV folders)
Examples:
mkvdup index-source /media/dvd-backups
`)
}
// printMatchUsage prints the help text for the "match" debug command.
func printMatchUsage() {
	fmt.Print(`Usage: mkvdup match <mkv-file> <source-dir>
Match MKV packets to source and show detailed results (debugging).
Arguments:
<mkv-file> Path to the MKV file
<source-dir> Directory containing source media
Examples:
mkvdup match movie.mkv /media/dvd-backups
`)
}
// Command mkvdup is the CLI tool for MKV-ISO deduplication.
package main
import (
"fmt"
"io"
"log"
"os"
"strconv"
"strings"
"time"
"github.com/stuckj/mkvdup/internal/daemon"
"github.com/stuckj/mkvdup/internal/dedup"
)
// MountOptions holds all options for the mount command.
type MountOptions struct {
	AllowOther      bool          // Allow other users to access the mount
	Foreground      bool          // Run in the foreground instead of daemonizing
	ConfigDir       bool          // Treat the config argument as a directory of YAML files
	PidFile         string        // Path to write the daemon PID to (empty = no PID file)
	DaemonTimeout   time.Duration // How long to wait for daemon startup
	PermissionsFile string        // Explicit permissions file path (overrides the default search locations)
	DefaultUID      uint32        // Default UID for files and directories
	DefaultGID      uint32        // Default GID for files and directories
	DefaultFileMode uint32        // Default mode bits for files (octal)
	DefaultDirMode  uint32        // Default mode bits for directories (octal)
	NoSourceWatch   bool          // Disable source file watching
	OnSourceChange  string        // Action on source change: "warn", "disable", "checksum"
	SourceWatchPollInterval time.Duration // Poll interval for network FS source watching (0 = 60s default)
	SourceReadTimeout       time.Duration // Pread timeout for network FS sources (0 = disabled; CLI default 30s)
	OnErrorCommand *dedup.ErrorCommandConfig // External command to run on source integrity error (from YAML config)
	NoConfigWatch  bool   // Disable config file watching
	OnConfigChange string // Action on config change: "reload", "warn"
}
// parseUint32 parses a string as a base-10 uint32.
// On any parse or range error it returns 0 along with the error.
func parseUint32(s string) (uint32, error) {
	parsed, parseErr := strconv.ParseUint(s, 10, 32)
	if parseErr != nil {
		// Return an explicit zero: ParseUint yields the maximum value on
		// range errors, which callers should never see as a usable result.
		return 0, parseErr
	}
	return uint32(parsed), nil
}
// parseOctalMode parses a string as an octal file mode.
// ParseUint with an explicit base of 8 accepts the digits with or without a
// leading "0" (both "444" and "0444" parse as octal 444); no stripping is
// needed. Returns 0 along with the error on invalid input.
func parseOctalMode(s string) (uint32, error) {
	mode, parseErr := strconv.ParseUint(s, 8, 32)
	if parseErr != nil {
		return 0, parseErr
	}
	return uint32(mode), nil
}
// parseWarnFlags extracts --warn-threshold from args, returning the
// parsed value and the remaining positional arguments.
// The threshold defaults to 75.0 and must lie in [0, 100]; invalid usage
// aborts the program via log.Fatalf, matching the other flag parsers.
func parseWarnFlags(args []string) (warnThreshold float64, remaining []string) {
	warnThreshold = 75.0
	i := 0
	for i < len(args) {
		arg := args[i]
		if arg != "--warn-threshold" {
			remaining = append(remaining, arg)
			i++
			continue
		}
		// A following token that itself starts with "--" is another flag,
		// not a value for this one.
		if i+1 >= len(args) || strings.HasPrefix(args[i+1], "--") {
			log.Fatalf("Error: --warn-threshold requires a numeric argument")
		}
		parsed, err := strconv.ParseFloat(args[i+1], 64)
		if err != nil {
			log.Fatalf("Error: --warn-threshold invalid: %v", err)
		}
		if parsed < 0 || parsed > 100 {
			log.Fatalf("Error: --warn-threshold must be between 0 and 100")
		}
		warnThreshold = parsed
		i += 2
	}
	return warnThreshold, remaining
}
// isTerminalStdout returns true if stdout is a terminal (not piped/redirected).
// A Stat failure is treated conservatively as "not a terminal".
func isTerminalStdout() bool {
	info, err := os.Stdout.Stat()
	if err != nil {
		return false
	}
	// Character devices are terminals; pipes and regular files are not.
	return (info.Mode() & os.ModeCharDevice) != 0
}
// version is set at build time via -ldflags (defaults to "dev" for local builds)
var version = "dev"
// verbose is set to true when the -v/--verbose flag is passed
var verbose bool
// logVerbose enables verbose diagnostics only in the log file (not on console);
// set by --log-verbose
var logVerbose bool
// showProgress controls whether progress bars are rendered. Set to false by
// --no-progress, --quiet, or when stdout is not a TTY.
var showProgress = true
// quiet suppresses all informational stdout output. Errors still go to stderr.
var quiet bool
// printVersion prints the build version string to stdout.
func printVersion() {
	fmt.Println("mkvdup version " + version)
}
func main() {
// Process global flags before command
args := os.Args[1:]
var filteredArgs []string
showHelp := false
showVersion := false
// Extract --cpuprofile flag (only available in debug builds)
args, cpuprofile := parseCPUProfileFlag(args)
defer startCPUProfile(cpuprofile)()
for i := 0; i < len(args); i++ {
arg := args[i]
switch {
case arg == "-v" || arg == "--verbose":
verbose = true
case arg == "-h" || arg == "--help":
showHelp = true
case arg == "--version":
showVersion = true
case arg == "--log-verbose":
logVerbose = true
case arg == "--no-progress":
showProgress = false
case arg == "-q" || arg == "--quiet":
quiet = true
showProgress = false
case arg == "--log-file":
if i+1 < len(args) {
i++
var err error
logFile, err = os.Create(args[i])
if err != nil {
log.Fatalf("Error: cannot create log file %s: %v", args[i], err)
}
} else {
log.Fatalf("Error: --log-file requires a path argument")
}
default:
filteredArgs = append(filteredArgs, arg)
}
}
args = filteredArgs
// Auto-disable progress bars when stdout is not a TTY
if !isTerminalStdout() {
showProgress = false
}
// Duplicate log package output (used for warnings and fatal errors) to
// the log file so that log.Printf and log.Fatalf messages appear there too.
if logFile != nil {
log.SetOutput(io.MultiWriter(os.Stderr, logFile))
defer logFile.Close()
}
// Handle --version (always top-level)
if showVersion {
printVersion()
os.Exit(0)
}
// If no command given, show appropriate help
if len(args) < 1 {
if showHelp {
printUsage()
os.Exit(0)
}
printUsage()
os.Exit(1)
}
cmd := args[0]
args = args[1:]
// If help flag was given with a command, show command-specific help
if showHelp {
printCommandUsage(cmd)
os.Exit(0)
}
switch cmd {
case "create":
warnThreshold, remaining := parseWarnFlags(args)
nonInteractive := false
var createArgs []string
for i := 0; i < len(remaining); i++ {
switch remaining[i] {
case "--non-interactive":
nonInteractive = true
default:
createArgs = append(createArgs, remaining[i])
}
}
if len(createArgs) < 3 {
printCommandUsage("create")
os.Exit(1)
}
output := createArgs[2]
name := ""
if len(createArgs) >= 4 {
name = createArgs[3]
}
if err := createDedup(createArgs[0], createArgs[1], output, name, warnThreshold, nonInteractive); err != nil {
log.Fatalf("Error: %v", err)
}
case "batch-create":
warnThreshold, remaining := parseWarnFlags(args)
skipCodecMismatch := false
var batchArgs []string
for _, arg := range remaining {
if arg == "--skip-codec-mismatch" {
skipCodecMismatch = true
} else {
batchArgs = append(batchArgs, arg)
}
}
if len(batchArgs) < 1 {
printCommandUsage("batch-create")
os.Exit(1)
}
if err := createBatch(batchArgs[0], warnThreshold, skipCodecMismatch); err != nil {
log.Fatalf("Error: %v", err)
}
case "probe":
if len(args) < 2 {
printCommandUsage("probe")
os.Exit(1)
}
// Split on "--": MKVs before, sources after
// For backward compat: if no "--", first arg is MKV, rest are sources
var mkvPaths, sourceDirs []string
sepIdx := -1
for i, a := range args {
if a == "--" {
sepIdx = i
break
}
}
if sepIdx >= 0 {
mkvPaths = args[:sepIdx]
sourceDirs = args[sepIdx+1:]
} else {
mkvPaths = args[:1]
sourceDirs = args[1:]
}
if len(mkvPaths) == 0 || len(sourceDirs) == 0 {
printCommandUsage("probe")
os.Exit(1)
}
if err := probe(mkvPaths, sourceDirs); err != nil {
log.Fatalf("Error: %v", err)
}
case "mount":
// Parse mount-specific options
allowOther := false
foreground := false
configDir := false
pidFile := ""
daemonTimeout := 30 * time.Second
permissionsFile := ""
defaultUID := uint32(os.Getuid())
defaultGID := uint32(os.Getgid())
defaultFileMode := uint32(0444)
defaultDirMode := uint32(0555)
noSourceWatch := false
onSourceChange := "checksum"
sourceWatchPollInterval := time.Duration(0)
sourceReadTimeout := 30 * time.Second
noConfigWatch := false
onConfigChange := "reload"
var mountArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--allow-other":
allowOther = true
case "--foreground", "-f":
foreground = true
case "--config-dir":
configDir = true
case "--pid-file":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
pidFile = args[i+1]
i++
} else {
log.Fatalf("Error: --pid-file requires a path argument")
}
case "--daemon-timeout":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
d, err := time.ParseDuration(args[i+1])
if err != nil {
log.Fatalf("Error: --daemon-timeout invalid duration: %v", err)
}
daemonTimeout = d
i++
} else {
log.Fatalf("Error: --daemon-timeout requires a duration argument (e.g., 30s, 1m)")
}
case "--permissions-file":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
permissionsFile = args[i+1]
i++
} else {
log.Fatalf("Error: --permissions-file requires a path argument")
}
case "--default-uid":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
uid, err := parseUint32(args[i+1])
if err != nil {
log.Fatalf("Error: --default-uid invalid: %v", err)
}
defaultUID = uid
i++
} else {
log.Fatalf("Error: --default-uid requires a numeric argument")
}
case "--default-gid":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
gid, err := parseUint32(args[i+1])
if err != nil {
log.Fatalf("Error: --default-gid invalid: %v", err)
}
defaultGID = gid
i++
} else {
log.Fatalf("Error: --default-gid requires a numeric argument")
}
case "--default-file-mode":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
mode, err := parseOctalMode(args[i+1])
if err != nil {
log.Fatalf("Error: --default-file-mode invalid: %v", err)
}
defaultFileMode = mode
i++
} else {
log.Fatalf("Error: --default-file-mode requires an octal mode argument")
}
case "--default-dir-mode":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
mode, err := parseOctalMode(args[i+1])
if err != nil {
log.Fatalf("Error: --default-dir-mode invalid: %v", err)
}
defaultDirMode = mode
i++
} else {
log.Fatalf("Error: --default-dir-mode requires an octal mode argument")
}
case "--no-source-watch":
noSourceWatch = true
case "--on-source-change":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
onSourceChange = args[i+1]
switch onSourceChange {
case "warn", "disable", "checksum":
// valid
default:
log.Fatalf("Error: --on-source-change must be warn, disable, or checksum")
}
i++
} else {
log.Fatalf("Error: --on-source-change requires an argument (warn, disable, or checksum)")
}
case "--source-watch-poll-interval":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
d, err := time.ParseDuration(args[i+1])
if err != nil {
log.Fatalf("Error: --source-watch-poll-interval invalid duration: %v", err)
}
if d <= 0 {
log.Fatalf("Error: --source-watch-poll-interval must be positive")
}
sourceWatchPollInterval = d
i++
} else {
log.Fatalf("Error: --source-watch-poll-interval requires a duration argument (e.g., 10s, 5m)")
}
case "--source-read-timeout":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
d, err := time.ParseDuration(args[i+1])
if err != nil {
log.Fatalf("Error: --source-read-timeout invalid duration: %v", err)
}
if d < 0 {
log.Fatalf("Error: --source-read-timeout must be non-negative")
}
sourceReadTimeout = d
i++
} else {
log.Fatalf("Error: --source-read-timeout requires a duration argument (e.g., 30s, 1m)")
}
case "--no-config-watch":
noConfigWatch = true
case "--on-config-change":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
onConfigChange = args[i+1]
switch onConfigChange {
case "reload", "warn":
// valid
default:
log.Fatalf("Error: --on-config-change must be reload or warn")
}
i++
} else {
log.Fatalf("Error: --on-config-change requires an argument (reload or warn)")
}
default:
mountArgs = append(mountArgs, args[i])
}
}
if len(mountArgs) < 1 {
printCommandUsage("mount")
os.Exit(1)
}
mountpoint := mountArgs[0]
configPaths := mountArgs[1:]
mountOpts := MountOptions{
AllowOther: allowOther,
Foreground: foreground,
ConfigDir: configDir,
PidFile: pidFile,
DaemonTimeout: daemonTimeout,
PermissionsFile: permissionsFile,
DefaultUID: defaultUID,
DefaultGID: defaultGID,
DefaultFileMode: defaultFileMode,
DefaultDirMode: defaultDirMode,
NoSourceWatch: noSourceWatch,
OnSourceChange: onSourceChange,
SourceWatchPollInterval: sourceWatchPollInterval,
SourceReadTimeout: sourceReadTimeout,
NoConfigWatch: noConfigWatch,
OnConfigChange: onConfigChange,
}
if err := mountFuse(mountpoint, configPaths, mountOpts); err != nil {
log.Fatalf("Error: %v", err)
}
case "info":
hideUnused := false
var infoArgs []string
for _, a := range args {
if a == "--hide-unused-files" {
hideUnused = true
} else {
infoArgs = append(infoArgs, a)
}
}
if len(infoArgs) < 1 {
printCommandUsage("info")
os.Exit(1)
}
if err := showInfo(infoArgs[0], hideUnused); err != nil {
log.Fatalf("Error: %v", err)
}
case "verify":
if len(args) < 3 {
printCommandUsage("verify")
os.Exit(1)
}
if err := verifyDedup(args[0], args[1], args[2]); err != nil {
log.Fatalf("Error: %v", err)
}
case "extract":
if len(args) < 3 {
printCommandUsage("extract")
os.Exit(1)
}
if err := extractDedup(args[0], args[1], args[2]); err != nil {
log.Fatalf("Error: %v", err)
}
case "check":
sourceChecksums := false
var checkArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--source-checksums":
sourceChecksums = true
default:
checkArgs = append(checkArgs, args[i])
}
}
if len(checkArgs) < 2 {
printCommandUsage("check")
os.Exit(1)
}
if err := checkDedup(checkArgs[0], checkArgs[1], sourceChecksums); err != nil {
log.Fatalf("Error: %v", err)
}
case "stats":
configDir := false
var statsArgs []string
for _, arg := range args {
if arg == "--config-dir" {
configDir = true
} else {
statsArgs = append(statsArgs, arg)
}
}
if len(statsArgs) < 1 {
printCommandUsage("stats")
os.Exit(1)
}
if err := showStats(statsArgs, configDir); err != nil {
log.Fatalf("Error: %v", err)
}
case "validate":
configDir := false
deep := false
strict := false
var valArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--config-dir":
configDir = true
case "--deep":
deep = true
case "--strict":
strict = true
default:
valArgs = append(valArgs, args[i])
}
}
if len(valArgs) < 1 {
printCommandUsage("validate")
os.Exit(1)
}
os.Exit(validateConfigs(valArgs, configDir, deep, strict))
case "reload":
if len(args) == 0 {
printCommandUsage("reload")
os.Exit(1)
}
pidFile := ""
pidDirect := 0
configDir := false
var reloadArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--pid-file":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
pidFile = args[i+1]
i++
} else {
log.Fatalf("Error: --pid-file requires a path argument")
}
case "--pid":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
p, err := strconv.Atoi(args[i+1])
if err != nil || p <= 0 {
log.Fatalf("Error: --pid requires a positive integer argument")
}
pidDirect = p
i++
} else {
log.Fatalf("Error: --pid requires a PID argument")
}
case "--config-dir":
configDir = true
default:
reloadArgs = append(reloadArgs, args[i])
}
}
if pidFile != "" && pidDirect != 0 {
log.Fatalf("Error: --pid-file and --pid are mutually exclusive")
}
var pid int
if pidDirect != 0 {
pid = pidDirect
} else if pidFile != "" {
var err error
pid, err = daemon.ReadPidFile(pidFile)
if err != nil {
log.Fatalf("Error: %v", err)
}
} else {
log.Fatalf("Error: --pid-file or --pid is required for reload")
}
if err := reloadDaemon(pid, reloadArgs, configDir); err != nil {
log.Fatalf("Error: %v", err)
}
case "expand-config":
outputPath := ""
dryRun := false
var expandArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--output":
if i+1 < len(args) && !strings.HasPrefix(args[i+1], "--") {
outputPath = args[i+1]
i++
} else {
log.Fatalf("Error: --output requires a path argument")
}
case "--dry-run":
dryRun = true
default:
expandArgs = append(expandArgs, args[i])
}
}
if len(expandArgs) != 1 {
printCommandUsage("expand-config")
os.Exit(1)
}
if err := expandConfigCmd(expandArgs[0], outputPath, dryRun); err != nil {
log.Fatalf("Error: %v", err)
}
case "relocate":
force := false
dryRun := false
var relocateArgs []string
for i := 0; i < len(args); i++ {
switch args[i] {
case "--force":
force = true
case "--dry-run":
dryRun = true
default:
relocateArgs = append(relocateArgs, args[i])
}
}
if len(relocateArgs) != 2 {
printCommandUsage("relocate")
os.Exit(1)
}
if err := relocateDedup(relocateArgs[0], relocateArgs[1], force, dryRun); err != nil {
log.Fatalf("Error: %v", err)
}
case "deltadiag":
if len(args) < 2 {
printCommandUsage("deltadiag")
os.Exit(1)
}
if err := deltadiag(args[0], args[1]); err != nil {
log.Fatalf("Error: %v", err)
}
case "parse-mkv":
if len(args) < 1 {
printCommandUsage("parse-mkv")
os.Exit(1)
}
if err := parseMKV(args[0]); err != nil {
log.Fatalf("Error: %v", err)
}
case "index-source":
if len(args) < 1 {
printCommandUsage("index-source")
os.Exit(1)
}
if err := indexSource(args[0]); err != nil {
log.Fatalf("Error: %v", err)
}
case "match":
if len(args) < 2 {
printCommandUsage("match")
os.Exit(1)
}
if err := matchMKV(args[0], args[1]); err != nil {
log.Fatalf("Error: %v", err)
}
case "help":
if len(args) > 0 {
printCommandUsage(args[0])
} else {
printUsage()
}
os.Exit(0)
default:
printWarn("Unknown command: %s\n\n", cmd)
printUsage()
os.Exit(1)
}
}
//go:build !debug
package main
// parseCPUProfileFlag is a no-op in release builds: the argument list is
// returned untouched and no profile path is reported.
// The --cpuprofile flag is only available in debug builds (go build -tags debug).
func parseCPUProfileFlag(args []string) ([]string, string) {
	const noProfilePath = ""
	return args, noProfilePath
}
// debugOptionsHelp contributes no extra usage text in release builds.
// Debug-only options are documented by the -tags debug variant of this file.
func debugOptionsHelp() string {
	var none string
	return none
}
// startCPUProfile does nothing in release builds; the returned stop
// function is likewise a no-op so callers can defer it unconditionally.
func startCPUProfile(_ string) func() {
	noop := func() {}
	return noop
}
package main
import (
"fmt"
"io"
"os"
"strings"
"time"
)
// logFile is set by --log-file to duplicate output to a file.
// Console output is unchanged; the log file receives non-TTY-style output
// (milestones instead of progress bars, no ANSI escape sequences).
// It stays nil when --log-file was not supplied; all writers nil-check it.
var logFile *os.File
// progressBar renders an in-place progress bar with ETA.
//
// When showProgress is true (TTY mode), it renders:
//
// Phase 2/6: Building source index...
// [████████████████████░░░░░░░░░░░░░░░░░░░░] 52% 2.3 GB / 4.5 GB ETA: 00:00:14
//
// On Finish(), the bar line is cleared and replaced with:
//
// Phase 2/6: Building source index... done (00:00:27)
//
// When showProgress is false (non-TTY), milestone percentages are printed at
// 10% intervals so redirected logs still show progress.
// When quiet is true, nothing is printed to stdout (log file still receives output).
type progressBar struct {
	prefix    string    // label line printed above the bar (e.g. "Phase 2/6: ...")
	total     int64     // total units of work; <= 0 disables drawing and milestones
	processed int64     // units completed so far, set via Update
	startTime time.Time // creation time, used for elapsed and ETA
	lastDraw  time.Time // last redraw time; draws are throttled to 500ms
	unit      string    // "bytes" or "packets"
	done      bool      // set by Finish/Cancel; further updates are ignored
	lastMilestone int   // last 10% milestone printed (0-10)
}
const barWidth = 40
// newProgressBar creates a progress bar and immediately announces its prefix
// line (e.g. "Phase 2/6: Building source index...") on stdout (unless quiet)
// and in the log file (if one is open). Unit should be "bytes" or "packets".
func newProgressBar(prefix string, total int64, unit string) *progressBar {
	bar := &progressBar{
		prefix:    prefix,
		total:     total,
		unit:      unit,
		startTime: time.Now(),
	}
	var sinks []io.Writer
	if !quiet {
		sinks = append(sinks, os.Stdout)
	}
	if logFile != nil {
		sinks = append(sinks, logFile)
	}
	for _, w := range sinks {
		fmt.Fprintln(w, prefix)
	}
	return bar
}
// Update records the current progress. Milestone output (non-TTY stdout
// and/or log file) is emitted as needed; the live bar is redrawn at most
// once per 500ms. No-op once the bar is finished or cancelled.
func (p *progressBar) Update(processed int64) {
	if p.done {
		return
	}
	p.processed = processed
	wantMilestones := (!showProgress && !quiet) || logFile != nil
	if wantMilestones {
		p.updateMilestone()
	}
	if quiet || !showProgress {
		return
	}
	// Throttle redraws so a tight update loop doesn't spam the terminal.
	if time.Since(p.lastDraw) < 500*time.Millisecond {
		return
	}
	p.lastDraw = time.Now()
	p.draw()
}
// updateMilestone emits a percentage line each time progress crosses a new
// 10% boundary. Output goes to stdout (when non-TTY) and/or the log file.
func (p *progressBar) updateMilestone() {
	if p.total <= 0 {
		return
	}
	current := int(float64(p.processed) / float64(p.total) * 100 / 10)
	if current > 10 {
		current = 10
	}
	if current <= p.lastMilestone {
		return
	}
	p.lastMilestone = current
	msg := fmt.Sprintf("  %d%% (%s)\n", current*10, formatDuration(time.Since(p.startTime)))
	if !showProgress && !quiet {
		fmt.Print(msg)
	}
	if logFile != nil {
		fmt.Fprint(logFile, msg)
	}
}
// Cancel abandons the bar on error without printing "done". In TTY mode it
// clears any partially drawn bar line and advances to a fresh line. Calling
// it after Finish() is a no-op.
func (p *progressBar) Cancel() {
	if p.done {
		return
	}
	p.done = true
	if showProgress && !quiet {
		// Erase the in-place bar and move past it.
		fmt.Print("\r\033[2K\n")
	}
}
// Finish marks the bar complete and prints "<prefix> done (HH:MM:SS)".
// In TTY mode the bar and prefix lines are erased first so the completion
// line replaces them in place. The log file always gets the plain line.
func (p *progressBar) Finish() {
	if p.done {
		return
	}
	p.done = true
	summary := fmt.Sprintf("%s done (%s)\n", p.prefix, formatDuration(time.Since(p.startTime)))
	if !quiet {
		if showProgress {
			// Erase bar line, move up, erase prefix line, then print summary.
			fmt.Print("\r\033[2K\033[A\r\033[2K" + summary)
		} else {
			fmt.Print(summary)
		}
	}
	if logFile != nil {
		fmt.Fprint(logFile, summary)
	}
}
// draw repaints the in-place bar line: fill cells, percentage, unit-specific
// counters, and ETA. A non-positive total disables drawing entirely.
func (p *progressBar) draw() {
	if p.total <= 0 {
		return
	}
	frac := float64(p.processed) / float64(p.total)
	if frac > 1.0 {
		frac = 1.0
	}
	filledCells := int(frac * float64(barWidth))
	if filledCells > barWidth {
		filledCells = barWidth
	}
	fill := strings.Repeat("█", filledCells)
	rest := strings.Repeat("░", barWidth-filledCells)
	// Counter text depends on what is being measured.
	var stats string
	switch p.unit {
	case "bytes":
		stats = fmt.Sprintf("%s / %s", formatSize(p.processed), formatSize(p.total))
	case "packets":
		stats = fmt.Sprintf("%s / %s", formatInt(p.processed), formatInt(p.total))
	}
	line := fmt.Sprintf("  [%s] %3.0f%% %s ETA: %s", fill+rest, frac*100, stats, p.eta())
	fmt.Printf("\r\033[2K%s", line)
}
// eta estimates the remaining time from the observed average rate.
// Returns the placeholder "--:--:--" during the first two seconds or while
// no progress has been made (the early rate is too noisy to be useful).
func (p *progressBar) eta() string {
	elapsed := time.Since(p.startTime)
	if elapsed < 2*time.Second || p.processed <= 0 || p.total <= 0 {
		return "--:--:--"
	}
	avgRate := float64(p.processed) / elapsed.Seconds()
	secsLeft := float64(p.total-p.processed) / avgRate
	if secsLeft < 0 {
		secsLeft = 0
	}
	return formatDuration(time.Duration(secsLeft * float64(time.Second)))
}
// formatDuration formats a duration as HH:MM:SS (hours are not capped at 24).
// Negative durations are clamped to zero: previously a negative input produced
// malformed output such as "00:00:-5" because each component is formatted
// independently with %02d.
func formatDuration(d time.Duration) string {
	if d < 0 {
		d = 0
	}
	d = d.Round(time.Second)
	h := int(d.Hours())
	m := int(d.Minutes()) % 60
	s := int(d.Seconds()) % 60
	return fmt.Sprintf("%02d:%02d:%02d", h, m, s)
}
// formatSize formats a byte count as a human-readable string using binary
// (1024-based) units. A TB tier is included so multi-terabyte totals render
// as e.g. "1.5 TB" instead of "1536.0 GB"; output for values below 1 TB is
// unchanged.
func formatSize(n int64) string {
	const (
		kb = int64(1024)
		mb = 1024 * kb
		gb = 1024 * mb
		tb = 1024 * gb
	)
	switch {
	case n >= tb:
		return fmt.Sprintf("%.1f TB", float64(n)/float64(tb))
	case n >= gb:
		return fmt.Sprintf("%.1f GB", float64(n)/float64(gb))
	case n >= mb:
		return fmt.Sprintf("%.1f MB", float64(n)/float64(mb))
	case n >= kb:
		return fmt.Sprintf("%.1f KB", float64(n)/float64(kb))
	default:
		return fmt.Sprintf("%d B", n)
	}
}
// printInfo writes informational output. Stdout is skipped when quiet is set;
// the log file (when open) always receives a copy.
func printInfo(format string, a ...any) {
	targets := make([]io.Writer, 0, 2)
	if !quiet {
		targets = append(targets, os.Stdout)
	}
	if logFile != nil {
		targets = append(targets, logFile)
	}
	for _, w := range targets {
		fmt.Fprintf(w, format, a...)
	}
}
// verboseWriter selects the destination for verbose diagnostic output:
//   - verbose + logFile → both stderr and log file (MultiWriter)
//   - verbose only      → stderr
//   - logVerbose + logFile → log file only
//   - otherwise         → nil (verbose disabled)
func verboseWriter() io.Writer {
	switch {
	case verbose && logFile != nil:
		return io.MultiWriter(os.Stderr, logFile)
	case verbose:
		return os.Stderr
	case logVerbose && logFile != nil:
		return logFile
	default:
		return nil
	}
}
// printInfoln writes an informational line. Stdout is skipped when quiet is
// set; the log file (when open) always receives a copy.
func printInfoln(a ...any) {
	targets := make([]io.Writer, 0, 2)
	if !quiet {
		targets = append(targets, os.Stdout)
	}
	if logFile != nil {
		targets = append(targets, logFile)
	}
	for _, w := range targets {
		fmt.Fprintln(w, a...)
	}
}
// printWarn writes warning/error output to stderr (never suppressed by quiet)
// and mirrors it to the log file when one is open.
func printWarn(format string, a ...any) {
	dests := []io.Writer{os.Stderr}
	if logFile != nil {
		dests = append(dests, logFile)
	}
	for _, w := range dests {
		fmt.Fprintf(w, format, a...)
	}
}
// printWarnln writes a warning/error line to stderr (never suppressed by
// quiet) and mirrors it to the log file when one is open.
func printWarnln(a ...any) {
	dests := []io.Writer{os.Stderr}
	if logFile != nil {
		dests = append(dests, logFile)
	}
	for _, w := range dests {
		fmt.Fprintln(w, a...)
	}
}
// Package daemon provides daemonization support for mkvdup FUSE mount.
//
// It uses a re-exec pattern where the parent process spawns a child with
// an environment variable marker. The child signals readiness to the parent
// via a pipe, allowing the parent to return success/failure appropriately.
package daemon
import (
"errors"
"fmt"
"io"
"os"
"os/exec"
"strconv"
"strings"
"syscall"
"time"
"golang.org/x/sys/unix"
)
// childEnvVar is the environment variable that marks a child daemon process.
const childEnvVar = "MKVDUP_DAEMON_CHILD"
// readyPipeFdEnvVar is the environment variable containing the pipe fd for signaling.
const readyPipeFdEnvVar = "MKVDUP_READY_PIPE_FD"
// Status codes sent from child to parent via the ready pipe.
const (
	statusReady byte = 0 // Mount successful
	statusError byte = 1 // Mount failed; followed by the error message bytes (see NotifyError)
)
// IsChild reports whether the current process was spawned as a daemon child
// (i.e. the child marker environment variable is set to "1" by Daemonize).
func IsChild() bool {
	marker, ok := os.LookupEnv(childEnvVar)
	return ok && marker == "1"
}
// Daemonize spawns the current executable as a background daemon.
// It waits for the child to signal readiness or error via a pipe.
// Returns nil on success (child signaled ready) or error on failure.
// The timeout specifies how long to wait for the child to signal.
//
// NOTE(review): the child is re-executed via os.Args[0], which may be a
// relative path — assumes the working directory is unchanged; confirm.
func Daemonize(pidFile string, timeout time.Duration) error {
	// Create pipe for child to signal readiness
	readPipe, writePipe, err := os.Pipe()
	if err != nil {
		return fmt.Errorf("create pipe: %w", err)
	}
	// Also closed explicitly in the timeout branch; the redundant Close there
	// is harmless and serves to unblock the reader goroutine.
	defer readPipe.Close()
	// Build command with same arguments
	cmd := exec.Command(os.Args[0], os.Args[1:]...)
	// Set up environment
	cmd.Env = append(os.Environ(),
		childEnvVar+"=1",
		readyPipeFdEnvVar+"=3", // fd 3 is after stdin/stdout/stderr
	)
	// Pass write end of pipe to child as fd 3
	// (ExtraFiles entry 0 maps to fd 3 in the child.)
	cmd.ExtraFiles = []*os.File{writePipe}
	// Detach from terminal
	cmd.Stdin = nil
	cmd.Stdout = nil
	cmd.Stderr = nil
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Setsid: true, // Create new session
	}
	// Start child process
	if err := cmd.Start(); err != nil {
		writePipe.Close()
		return fmt.Errorf("start daemon: %w", err)
	}
	// Close write end in parent (child has it). After this, EOF on readPipe
	// means the child exited without writing a status byte.
	writePipe.Close()
	// Wait for child to signal with timeout. The channel is buffered so the
	// goroutine never blocks on send if the timeout branch wins the select.
	resultChan := make(chan error, 1)
	go func() {
		status := make([]byte, 1)
		n, err := readPipe.Read(status)
		if err != nil {
			if errors.Is(err, io.EOF) {
				resultChan <- fmt.Errorf("daemon child exited unexpectedly")
			} else {
				resultChan <- fmt.Errorf("read from child: %w", err)
			}
			return
		}
		if n != 1 {
			resultChan <- fmt.Errorf("unexpected read size from child: %d", n)
			return
		}
		if status[0] == statusReady {
			resultChan <- nil
		} else {
			// Read full error message until EOF to avoid truncation
			// (NotifyError writes the status byte, then the message text).
			errMsg, readErr := io.ReadAll(readPipe)
			if readErr != nil && !errors.Is(readErr, io.EOF) {
				resultChan <- fmt.Errorf("daemon failed (error reading message): %v", readErr)
				return
			}
			if len(errMsg) > 0 {
				resultChan <- fmt.Errorf("daemon failed: %s", string(errMsg))
			} else {
				resultChan <- fmt.Errorf("daemon failed with unknown error")
			}
		}
	}()
	select {
	case err := <-resultChan:
		if err != nil {
			// Try to clean up the child
			if cmd.Process != nil {
				cmd.Process.Kill()
			}
			return err
		}
		// Success - child is running and mount is ready
		if pidFile != "" {
			// Write PID file from parent since child may not have permission
			if err := WritePidFile(pidFile, cmd.Process.Pid); err != nil {
				fmt.Fprintf(os.Stderr, "warning: failed to write pid file: %v\n", err)
			}
		}
		return nil
	case <-time.After(timeout):
		// Close pipe to unblock the goroutine waiting on Read()
		readPipe.Close()
		if cmd.Process != nil {
			cmd.Process.Kill()
		}
		return fmt.Errorf("daemon startup timed out after %v", timeout)
	}
}
// NotifyReady reports a successful mount to the waiting parent process by
// writing the ready status byte to the inherited pipe. The child calls this
// once the FUSE mount is up.
func NotifyReady() error {
	fd, fdErr := getReadyPipeFd()
	if fdErr != nil {
		return fdErr
	}
	readyPipe := os.NewFile(fd, "ready-pipe")
	if readyPipe == nil {
		return fmt.Errorf("invalid pipe fd")
	}
	defer readyPipe.Close()
	_, writeErr := readyPipe.Write([]byte{statusReady})
	return writeErr
}
// NotifyError reports a startup failure to the waiting parent process: the
// error status byte first, then the error text so the parent can relay it.
// The child calls this if anything goes wrong before the mount is ready.
func NotifyError(mountErr error) error {
	fd, fdErr := getReadyPipeFd()
	if fdErr != nil {
		return fdErr
	}
	errPipe := os.NewFile(fd, "ready-pipe")
	if errPipe == nil {
		return fmt.Errorf("invalid pipe fd")
	}
	defer errPipe.Close()
	if _, err := errPipe.Write([]byte{statusError}); err != nil {
		return err
	}
	_, err := errPipe.Write([]byte(mountErr.Error()))
	return err
}
// getReadyPipeFd reads the ready-pipe file descriptor number from the
// environment variable set by Daemonize. Errors if the variable is absent
// (not a daemon child) or not a valid unsigned integer.
func getReadyPipeFd() (uintptr, error) {
	raw := os.Getenv(readyPipeFdEnvVar)
	if raw == "" {
		return 0, fmt.Errorf("not running as daemon child")
	}
	parsed, err := strconv.ParseUint(raw, 10, strconv.IntSize)
	if err != nil {
		return 0, fmt.Errorf("invalid pipe fd: %w", err)
	}
	return uintptr(parsed), nil
}
// Detach closes stdin, stdout, and stderr to fully detach from the terminal.
// This should be called by the child after signaling ready.
func Detach() {
	// Redirect standard file descriptors to /dev/null
	devNull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
	if err != nil {
		fmt.Fprintf(os.Stderr, "daemon: failed to open /dev/null: %v\n", err)
		return
	}
	// Replace stdin, stdout, stderr with /dev/null
	// Use unix.Dup2 for cross-architecture compatibility (syscall.Dup2 not available on arm64)
	// Errors are logged but not fatal since the daemon can still function.
	// stderr is redirected last so the earlier failure messages still reach it.
	if err := unix.Dup2(int(devNull.Fd()), int(os.Stdin.Fd())); err != nil {
		fmt.Fprintf(os.Stderr, "daemon: failed to redirect stdin: %v\n", err)
	}
	if err := unix.Dup2(int(devNull.Fd()), int(os.Stdout.Fd())); err != nil {
		fmt.Fprintf(os.Stderr, "daemon: failed to redirect stdout: %v\n", err)
	}
	if err := unix.Dup2(int(devNull.Fd()), int(os.Stderr.Fd())); err != nil {
		// stderr may already be redirected, best effort
		_ = err
	}
	// The original fd is no longer needed: dup2 left copies on fds 0/1/2.
	if err := devNull.Close(); err != nil {
		// Can't log since stderr may be redirected
		_ = err
	}
}
// WritePidFile writes the given PID to a file.
func WritePidFile(path string, pid int) error {
return os.WriteFile(path, []byte(strconv.Itoa(pid)+"\n"), 0644)
}
// RemovePidFile removes the PID file at the given path.
// It returns the os.Remove error unchanged (e.g. fs.ErrNotExist if absent).
func RemovePidFile(path string) error {
	return os.Remove(path)
}
// ReadPidFile reads a PID from the given file path.
func ReadPidFile(path string) (int, error) {
data, err := os.ReadFile(path)
if err != nil {
return 0, fmt.Errorf("read pid file: %w", err)
}
pidStr := strings.TrimSpace(string(data))
pid, err := strconv.Atoi(pidStr)
if err != nil {
return 0, fmt.Errorf("invalid pid in %s: %w", path, err)
}
if pid <= 0 {
return 0, fmt.Errorf("invalid pid %d in %s", pid, path)
}
return pid, nil
}
package dedup
import (
"fmt"
"log"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/bmatcuk/doublestar/v4"
"github.com/stuckj/mkvdup/internal/security"
"gopkg.in/yaml.v3"
)
// Config represents the contents of a .mkvdup.yaml file.
// All three fields are required; ReadConfig rejects a config missing any.
type Config struct {
	Name      string `yaml:"name"`       // entry name
	DedupFile string `yaml:"dedup_file"` // path to the dedup file
	SourceDir string `yaml:"source_dir"` // path to the source directory
}
// configFile is the internal YAML representation that supports includes
// and virtual_files in addition to the standard Config fields.
// The top-level Name/DedupFile/SourceDir triple is all-or-nothing
// (enforced by validateConfigFields).
type configFile struct {
	Name         string              `yaml:"name,omitempty"`
	DedupFile    string              `yaml:"dedup_file,omitempty"`
	SourceDir    string              `yaml:"source_dir,omitempty"`
	Includes     []string            `yaml:"includes,omitempty"`      // glob patterns of further configs to load
	VirtualFiles []Config            `yaml:"virtual_files,omitempty"` // additional entries; each must be complete
	OnErrorCommand *ErrorCommandConfig `yaml:"on_error_command,omitempty"` // first one seen wins across resolution
}
// ErrorCommandConfig configures an external command to run when a source
// integrity issue is detected. Placeholders in command arguments (%source%,
// %files%, %event%) are substituted at runtime.
type ErrorCommandConfig struct {
	Command       CommandValue  `yaml:"command"`                  // command to execute (string or arg list)
	Timeout       time.Duration `yaml:"timeout,omitempty"`        // defaults to 30s via applyDefaults when unset
	BatchInterval time.Duration `yaml:"batch_interval,omitempty"` // defaults to 5s via applyDefaults when unset
}
// applyDefaults replaces non-positive Timeout and BatchInterval values with
// the package defaults (30s and 5s respectively).
func (c *ErrorCommandConfig) applyDefaults() {
	const (
		fallbackTimeout  = 30 * time.Second
		fallbackInterval = 5 * time.Second
	)
	if c.Timeout <= 0 {
		c.Timeout = fallbackTimeout
	}
	if c.BatchInterval <= 0 {
		c.BatchInterval = fallbackInterval
	}
}
// CommandValue supports both string and []string YAML formats.
// A string value is executed via "sh -c"; a list is executed directly.
// See UnmarshalYAML/MarshalYAML for the two-way YAML mapping.
type CommandValue struct {
	IsShell bool // true if the original YAML was a string (run via sh -c)
	Args []string // for shell: single element with the command string; for list: the arg list
}
// UnmarshalYAML decodes a CommandValue from either YAML form: a scalar
// string (shell command, run via sh -c) or a sequence of strings (direct
// argv). Empty values and any other node kind are rejected.
func (c *CommandValue) UnmarshalYAML(value *yaml.Node) error {
	switch value.Kind {
	case yaml.ScalarNode:
		var shellCmd string
		if err := value.Decode(&shellCmd); err != nil {
			return err
		}
		if shellCmd == "" {
			return fmt.Errorf("on_error_command: command must not be empty")
		}
		c.IsShell = true
		c.Args = []string{shellCmd}
		return nil
	case yaml.SequenceNode:
		var argv []string
		if err := value.Decode(&argv); err != nil {
			return err
		}
		if len(argv) == 0 {
			return fmt.Errorf("on_error_command: command list must not be empty")
		}
		c.IsShell = false
		c.Args = argv
		return nil
	default:
		return fmt.Errorf("on_error_command: command must be a string or list of strings")
	}
}
// MarshalYAML emits the form the value was originally parsed from: shell
// commands (string form) become a scalar, everything else a sequence.
func (c CommandValue) MarshalYAML() (interface{}, error) {
	if !c.IsShell || len(c.Args) != 1 {
		return c.Args, nil
	}
	return c.Args[0], nil
}
// WriteConfig writes the .mkvdup.yaml config file.
func WriteConfig(configPath, name, dedupFile, sourceDir string) error {
content := fmt.Sprintf(`# Auto-generated by mkvdup create
name: %q
dedup_file: %q
source_dir: %q
`, name, dedupFile, sourceDir)
return os.WriteFile(configPath, []byte(content), 0644)
}
// ReadConfig loads and validates a .mkvdup.yaml config file. All three
// fields (name, dedup_file, source_dir) must be present and non-empty.
func ReadConfig(configPath string) (*Config, error) {
	raw, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("read config file: %w", err)
	}
	cfg := &Config{}
	if err := yaml.Unmarshal(raw, cfg); err != nil {
		return nil, fmt.Errorf("parse config %s: %w", configPath, err)
	}
	if cfg.Name == "" || cfg.DedupFile == "" || cfg.SourceDir == "" {
		return nil, fmt.Errorf("invalid config: missing required fields")
	}
	return cfg, nil
}
// ResolveConfigs expands the given config files — recursively following
// includes and flattening virtual_files — into a flat list of Config
// entries. A shared seen-set provides cycle detection across all roots.
//
// The first on_error_command encountered (depth-first, in file order) wins;
// its omitted timeout/batch_interval fields get defaults applied.
//
// loadedPaths lists the absolute, symlink-resolved path of every config
// file that loaded successfully, sorted — handy for file watchers.
func ResolveConfigs(configPaths []string) (configs []Config, errorCmd *ErrorCommandConfig, loadedPaths []string, err error) {
	visited := make(map[string]bool)
	for _, path := range configPaths {
		resolved, cmd, resolveErr := resolveConfig(path, visited)
		if resolveErr != nil {
			return nil, nil, nil, resolveErr
		}
		configs = append(configs, resolved...)
		if cmd != nil && errorCmd == nil {
			errorCmd = cmd
		}
	}
	if errorCmd != nil {
		if len(errorCmd.Command.Args) == 0 {
			return nil, nil, nil, fmt.Errorf("invalid on_error_command: missing command")
		}
		errorCmd.applyDefaults()
	}
	loadedPaths = make([]string, 0, len(visited))
	for path := range visited {
		loadedPaths = append(loadedPaths, path)
	}
	sort.Strings(loadedPaths)
	return configs, errorCmd, loadedPaths, nil
}
// configVisitor is called for each config file visited during the walk.
// realPath is the resolved absolute path of the config file.
// cf is the parsed config file contents.
// configDir is the directory containing the config file (for resolving relative paths).
// phase indicates when the visitor is called:
// - "pre" — before recursing into includes (for top-level config entries)
// - "post" — after includes have been recursed (for virtual_files)
// Returning a non-nil error aborts the walk immediately.
type configVisitor func(phase string, realPath string, cf *configFile, configDir string) error
// walkConfig recursively walks a config file tree, calling the visitor for each
// file. It handles path resolution, symlink resolution, cycle detection,
// ownership checks, YAML parsing, and includes glob expansion — the shared
// logic that all config resolution uses.
//
// The visitor is called twice per file: once with phase "pre" before processing
// includes, and once with phase "post" after. This preserves the original
// ordering: top-level entries → included configs → virtual_files.
// openConfigFile canonicalizes a config path (absolute, symlinks resolved),
// verifies file ownership, then reads and parses the YAML. Returns the
// canonical real path, the raw bytes, and the parsed config.
func openConfigFile(configPath string) (realPath string, data []byte, cf *configFile, err error) {
	var absPath string
	if absPath, err = filepath.Abs(configPath); err != nil {
		return "", nil, nil, fmt.Errorf("resolve path %s: %w", configPath, err)
	}
	if realPath, err = filepath.EvalSymlinks(absPath); err != nil {
		return "", nil, nil, fmt.Errorf("resolve symlinks %s: %w", absPath, err)
	}
	// Reject configs not owned appropriately before reading them.
	if ownErr := security.CheckFileOwnershipResolved(realPath); ownErr != nil {
		return "", nil, nil, fmt.Errorf("config file %s: %w", realPath, ownErr)
	}
	if data, err = os.ReadFile(realPath); err != nil {
		return "", nil, nil, fmt.Errorf("read config file %s: %w", realPath, err)
	}
	parsed := &configFile{}
	if err = yaml.Unmarshal(data, parsed); err != nil {
		return "", nil, nil, fmt.Errorf("parse config %s: %w", realPath, err)
	}
	return realPath, data, parsed, nil
}
// validateConfigFields rejects configs where only some of the top-level
// triple (name, dedup_file, source_dir) is set, and virtual_files entries
// missing any required field.
func validateConfigFields(realPath string, cf *configFile) error {
	populated := 0
	for _, field := range []string{cf.Name, cf.DedupFile, cf.SourceDir} {
		if field != "" {
			populated++
		}
	}
	// The triple is all-or-nothing.
	if populated != 0 && populated != 3 {
		return fmt.Errorf("config %s: name, dedup_file, and source_dir must all be set if any is set", realPath)
	}
	for _, vf := range cf.VirtualFiles {
		if vf.Name == "" || vf.DedupFile == "" || vf.SourceDir == "" {
			return fmt.Errorf("config %s: virtual_files entry missing required fields (name, dedup_file, source_dir)", realPath)
		}
	}
	return nil
}
// walkConfig recursively walks a config file tree, calling visit with phase
// "pre" before expanding includes and "post" after, so ordering is:
// top-level entries → included configs → virtual_files. The seen set holds
// canonical paths for cycle detection across the whole walk.
func walkConfig(configPath string, seen map[string]bool, visit configVisitor) error {
	// openConfigFile resolves abs + symlinks, checks ownership, reads, parses.
	realPath, _, cf, err := openConfigFile(configPath)
	if err != nil {
		return err
	}
	// Cycle detection using the canonical path from openConfigFile.
	// (A repeat visit is skipped with a warning, not treated as an error.)
	if seen[realPath] {
		log.Printf("warning: skipping already-seen config %s (cycle detection)", realPath)
		return nil
	}
	seen[realPath] = true
	configDir := filepath.Dir(realPath)
	// Pre-includes visit (top-level config entries).
	if err := visit("pre", realPath, cf, configDir); err != nil {
		return err
	}
	// Recurse into includes.
	for _, pattern := range cf.Includes {
		// Patterns are relative to the including file's directory.
		pattern = resolveRelative(configDir, pattern)
		matches, err := doublestar.FilepathGlob(pattern)
		if err != nil {
			return fmt.Errorf("expand include pattern %q in %s: %w", pattern, realPath, err)
		}
		// Sort for deterministic resolution order across runs.
		sort.Strings(matches)
		for _, match := range matches {
			if err := walkConfig(match, seen, visit); err != nil {
				return err
			}
		}
	}
	// Post-includes visit (virtual_files).
	if err := visit("post", realPath, cf, configDir); err != nil {
		return err
	}
	return nil
}
// resolveConfig walks a single config file (and its includes, recursively)
// and collects the resulting Config entries in walk order, along with the
// first on_error_command encountered anywhere in the resolution (or nil).
func resolveConfig(configPath string, seen map[string]bool) ([]Config, *ErrorCommandConfig, error) {
	var (
		out      []Config
		errorCmd *ErrorCommandConfig
	)
	visitor := func(phase, realPath string, cf *configFile, configDir string) error {
		if phase != "pre" {
			// "post" phase: emit virtual_files entries. They were already
			// validated during this file's "pre" phase.
			for _, vf := range cf.VirtualFiles {
				out = append(out, Config{
					Name:      vf.Name,
					DedupFile: resolveRelative(configDir, vf.DedupFile),
					SourceDir: resolveRelative(configDir, vf.SourceDir),
				})
			}
			return nil
		}
		// First on_error_command wins across the whole resolution.
		if errorCmd == nil && cf.OnErrorCommand != nil {
			errorCmd = cf.OnErrorCommand
		}
		if err := validateConfigFields(realPath, cf); err != nil {
			return err
		}
		// Emit the top-level mapping when fully specified.
		if cf.Name != "" && cf.DedupFile != "" && cf.SourceDir != "" {
			out = append(out, Config{
				Name:      cf.Name,
				DedupFile: resolveRelative(configDir, cf.DedupFile),
				SourceDir: resolveRelative(configDir, cf.SourceDir),
			})
		}
		return nil
	}
	err := walkConfig(configPath, seen, visitor)
	return out, errorCmd, err
}
// BatchManifest represents the batch create manifest file format.
// SourceDir, when set, supplies a default source directory for file
// entries that do not specify their own (see ReadBatchManifest).
type BatchManifest struct {
	SourceDir string              `yaml:"source_dir"`
	Files     []BatchManifestFile `yaml:"files"`
}
// BatchManifestFile represents a single file entry in a batch manifest.
// MKV and Output are required; Name defaults to the MKV basename (with a
// ".mkv" extension enforced) and SourceDir falls back to the manifest-level
// default — see ReadBatchManifest for how defaults are applied.
type BatchManifestFile struct {
	MKV       string `yaml:"mkv"`        // Required: input MKV path
	Output    string `yaml:"output"`     // Required: output path
	Name      string `yaml:"name"`       // Optional: display/output name
	SourceDir string `yaml:"source_dir"` // Optional: per-file source directory
}
// ReadBatchManifest reads and validates a batch manifest file.
// Relative paths are resolved against the manifest file's directory.
// Defaults are applied for optional fields: name falls back to the MKV
// basename (with a ".mkv" extension enforced) and source_dir falls back to
// the manifest-level default; a file with neither is an error.
func ReadBatchManifest(manifestPath string) (*BatchManifest, error) {
	raw, err := os.ReadFile(manifestPath)
	if err != nil {
		return nil, fmt.Errorf("read batch manifest: %w", err)
	}
	var m BatchManifest
	if err := yaml.Unmarshal(raw, &m); err != nil {
		return nil, fmt.Errorf("parse batch manifest %s: %w", manifestPath, err)
	}
	if len(m.Files) == 0 {
		return nil, fmt.Errorf("batch manifest %s: files list is empty", manifestPath)
	}
	abs, err := filepath.Abs(manifestPath)
	if err != nil {
		return nil, fmt.Errorf("resolve manifest path: %w", err)
	}
	baseDir := filepath.Dir(abs)
	// Top-level default source_dir: resolve against the manifest dir, clean.
	if m.SourceDir != "" {
		m.SourceDir = filepath.Clean(resolveRelative(baseDir, m.SourceDir))
	}
	for i := range m.Files {
		entry := &m.Files[i]
		if entry.MKV == "" {
			return nil, fmt.Errorf("batch manifest %s: files[%d] missing required 'mkv' field", manifestPath, i)
		}
		entry.MKV = resolveRelative(baseDir, entry.MKV)
		if entry.Output == "" {
			return nil, fmt.Errorf("batch manifest %s: files[%d] missing required 'output' field", manifestPath, i)
		}
		entry.Output = resolveRelative(baseDir, entry.Output)
		// Default the name to the MKV basename, then enforce the extension.
		if entry.Name == "" {
			entry.Name = filepath.Base(entry.MKV)
		}
		if !strings.HasSuffix(strings.ToLower(entry.Name), ".mkv") {
			entry.Name += ".mkv"
		}
		// Per-file source_dir wins; otherwise use the top-level default.
		switch {
		case entry.SourceDir != "":
			entry.SourceDir = filepath.Clean(resolveRelative(baseDir, entry.SourceDir))
		case m.SourceDir != "":
			entry.SourceDir = m.SourceDir
		default:
			return nil, fmt.Errorf("batch manifest %s: files[%d] has no source_dir (set per-file or top-level default)", manifestPath, i)
		}
	}
	return &m, nil
}
// resolveRelative joins path onto baseDir unless path is already absolute,
// in which case it is returned untouched.
func resolveRelative(baseDir, path string) string {
	if !filepath.IsAbs(path) {
		return filepath.Join(baseDir, path)
	}
	return path
}
package dedup
import (
"fmt"
"path/filepath"
"sort"
"github.com/bmatcuk/doublestar/v4"
"gopkg.in/yaml.v3"
)
// resolveIncludePaths reads standard config files and resolves their includes
// glob patterns into a sorted, deduplicated list of absolute file paths.
// It can be used to compute the explicit set of config files that contribute
// mappings from a wildcard-based configuration.
func resolveIncludePaths(configPaths []string) ([]string, error) {
seen := make(map[string]bool)
var files []string
for _, configPath := range configPaths {
err := walkConfig(configPath, seen, func(phase, realPath string, cf *configFile, _ string) error {
if phase != "pre" {
return nil
}
if err := validateConfigFields(realPath, cf); err != nil {
return err
}
// Collect paths of configs that contribute any mappings.
hasDirectMapping := cf.Name != "" && cf.DedupFile != "" && cf.SourceDir != ""
if hasDirectMapping || len(cf.VirtualFiles) > 0 {
files = append(files, realPath)
}
return nil
})
if err != nil {
return nil, err
}
}
sort.Strings(files)
return files, nil
}
// ExpandConfigFile reads a config file, resolves its includes glob patterns
// to explicit paths (single level, no recursion), and returns the expanded
// config as YAML bytes. All other settings (on_error_command, virtual_files,
// top-level name/dedup_file/source_dir) are preserved unchanged. The included
// files themselves are not modified — they can still contain their own globs.
func ExpandConfigFile(configPath string) ([]byte, error) {
	realPath, _, cf, err := openConfigFile(configPath)
	if err != nil {
		return nil, err
	}
	if err := validateConfigFields(realPath, cf); err != nil {
		return nil, err
	}
	// Mirror ResolveConfigs' on_error_command validation so expand-config
	// fails fast on invalid input.
	if cf.OnErrorCommand != nil && len(cf.OnErrorCommand.Command.Args) == 0 {
		return nil, fmt.Errorf("%s: on_error_command.command must not be empty", realPath)
	}
	// No includes: re-marshal the parsed config (rather than echoing raw
	// bytes) for consistent formatting and to avoid accumulating headers
	// when expand-config is re-run on its own output.
	if len(cf.Includes) == 0 {
		return yaml.Marshal(cf)
	}
	configDir := filepath.Dir(realPath)
	dedup := make(map[string]bool)
	var expanded []string
	for _, pattern := range cf.Includes {
		glob := resolveRelative(configDir, pattern)
		matches, err := doublestar.FilepathGlob(glob)
		if err != nil {
			return nil, fmt.Errorf("expand include pattern %q in %s: %w", glob, realPath, err)
		}
		sort.Strings(matches)
		for _, match := range matches {
			abs, err := filepath.Abs(match)
			if err != nil {
				return nil, fmt.Errorf("resolve path %s: %w", match, err)
			}
			// Canonicalize via EvalSymlinks (best effort) so dedup keys match
			// walkConfig's cycle-detection keys.
			canonical := abs
			if resolved, err := filepath.EvalSymlinks(abs); err == nil {
				canonical = resolved
			}
			if dedup[canonical] {
				continue
			}
			dedup[canonical] = true
			expanded = append(expanded, canonical)
		}
	}
	// Swap the glob patterns for the globally-sorted explicit paths.
	sort.Strings(expanded)
	cf.Includes = expanded
	out, err := yaml.Marshal(cf)
	if err != nil {
		return nil, fmt.Errorf("marshal expanded config: %w", err)
	}
	return out, nil
}
// Package dedup provides reading and writing of .mkvdup deduplication files.
package dedup
import (
"encoding/binary"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/source"
)
// File format constants
const (
	Magic   = "MKVDUP01"
	Version = 3 // v3: Source field expanded to uint16 for >256 source files
	// VersionRangeMap is the version for files with embedded range maps.
	// Entries use ES offsets; a range map section maps ES offsets to raw file offsets.
	VersionRangeMap uint32 = 4
	// VersionCreator is V3 with a creator version string after the header.
	VersionCreator uint32 = 5
	// VersionRangeMapCreator is V4 with a creator version string after the header.
	VersionRangeMapCreator uint32 = 6
	// VersionUsed is V7: V5 with a per-source-file Used byte after the checksum.
	VersionUsed uint32 = 7
	// VersionRangeMapUsed is V8: V6 with a per-source-file Used byte after the checksum.
	VersionRangeMapUsed uint32 = 8
	// HeaderSize = Magic(8) + Version(4) + Flags(4) + OriginalSize(8) + OriginalChecksum(8) +
	// SourceType(1) + UsesESOffsets(1) + SourceFileCount(2) + EntryCount(8) +
	// DeltaOffset(8) + DeltaSize(8) = 60 bytes
	HeaderSize = 60
	EntrySize  = 28 // Fixed entry size: 8+8+2+8+1+1 = 28 bytes
	// FooterSize is the V3 footer: IndexChecksum(8) + DeltaChecksum(8) + Magic(8).
	FooterSize   = 24
	FooterV4Size = 32 // V4 footer adds RangeMapChecksum (8 bytes)
	MagicSize    = 8
	VersionSize  = 4
	// MaxCreatorVersionLen caps the creator version string's byte length
	// (writer truncates, reader rejects).
	MaxCreatorVersionLen = 4096
)
// Source types (values stored in Header.SourceType).
const (
	SourceTypeDVD    uint8 = 0
	SourceTypeBluray uint8 = 1
)
// Header represents the fixed header at the start of a .mkvdup file.
// The field sizes sum to HeaderSize (60 bytes); see the HeaderSize
// constant's comment for the on-disk layout.
type Header struct {
	Magic            [8]byte // "MKVDUP01"
	Version          uint32  // File format version
	Flags            uint32  // Reserved for future use
	OriginalSize     int64   // Size of original MKV file
	OriginalChecksum uint64  // xxhash of original MKV file
	SourceType       uint8   // 0=DVD, 1=Blu-ray
	UsesESOffsets    uint8   // 1 if source uses ES offsets (MPEG-PS)
	SourceFileCount  uint16  // Number of source files
	EntryCount       uint64  // Number of index entries
	DeltaOffset      int64   // Offset to delta section
	DeltaSize        int64   // Size of delta section
}
// SourceFile represents a source file entry in the dedup file.
type SourceFile struct {
	RelativePath string // Path relative to source directory
	Size         int64  // File size
	Checksum     uint64 // xxhash of file
	Used         bool   // Whether this source file is referenced by any entry (V7/V8 only)
}
// Entry represents an index entry in the dedup file.
// This mirrors matcher.Entry but is specifically for serialization.
type Entry struct {
MkvOffset int64 // Start offset in the MKV file
Length int64 // Length of this region
Source uint16 // 0 = delta, 1+ = source file index + 1 (supports up to 65535 files)
SourceOffset int64 // Offset in source file (or ES offset)
IsVideo bool // For ES-based sources
AudioSubStreamID byte // For ES-based audio sub-streams
IsLPCM bool // True if 16-bit LPCM audio requiring byte-swap on read
}
// RawEntry matches the 28-byte on-disk entry format exactly.
// Uses byte arrays for int64 fields to handle unaligned access portably.
// This enables direct memory-mapped access without parsing into []Entry.
type RawEntry struct {
MkvOffset [8]byte // int64, little-endian
Length [8]byte // int64, little-endian
Source [2]byte // uint16, little-endian
SourceOffset [8]byte // int64, little-endian (unaligned at byte 18)
ESFlags uint8 // bit 0 = IsVideo
AudioSubStreamID uint8
}
// ESFlags bit layout:
//
// bit 0: IsVideo
// bit 1: IsLPCM (16-bit LPCM requiring byte-swap on read)
// bits 2-7: reserved
// ToEntry converts a RawEntry to an Entry by parsing the byte arrays.
func (r *RawEntry) ToEntry() Entry {
e := Entry{
MkvOffset: int64(binary.LittleEndian.Uint64(r.MkvOffset[:])),
Length: int64(binary.LittleEndian.Uint64(r.Length[:])),
Source: binary.LittleEndian.Uint16(r.Source[:]),
SourceOffset: int64(binary.LittleEndian.Uint64(r.SourceOffset[:])),
IsVideo: r.ESFlags&1 == 1,
AudioSubStreamID: r.AudioSubStreamID,
IsLPCM: r.ESFlags&2 != 0,
}
return e
}
// Footer represents the footer at the end of a .mkvdup file.
// The trailing magic lets readers locate the footer by scanning backward
// from the end of the file.
type Footer struct {
	IndexChecksum    uint64  // xxhash of index section
	DeltaChecksum    uint64  // xxhash of delta section
	RangeMapChecksum uint64  // xxhash of range map section (V4 only; 0 for V3)
	Magic            [8]byte // "MKVDUP01" (for reverse scanning)
}
// File represents a complete dedup file structure for reconstruction.
// Note: Entries are accessed directly from mmap via Reader.getEntry(),
// not stored in this struct, to avoid large memory allocation.
type File struct {
	Header         Header
	SourceFiles    []SourceFile
	DeltaOffset    int64 // Offset to delta section in file
	UsesESOffsets  bool
	CreatorVersion string // Version of mkvdup that created this file (V5+ only)
	headerSize     int64  // Effective header size (60 for V3/V4, 60+2+len for V5-V8)
}
// creatorVersionSize returns the on-disk size of the creator version field:
// 0 when the string is absent, otherwise a 2-byte length prefix plus the
// string bytes.
func creatorVersionSize(v string) int64 {
	if len(v) == 0 {
		return 0
	}
	return int64(len(v)) + 2
}
// ToMatcherEntry converts a dedup Entry to a matcher Entry, copying each
// field one-to-one.
func (e *Entry) ToMatcherEntry() matcher.Entry {
	var m matcher.Entry
	m.MkvOffset = e.MkvOffset
	m.Length = e.Length
	m.Source = e.Source
	m.SourceOffset = e.SourceOffset
	m.IsVideo = e.IsVideo
	m.AudioSubStreamID = e.AudioSubStreamID
	m.IsLPCM = e.IsLPCM
	return m
}
// FromMatcherEntry creates a dedup Entry from a matcher Entry, copying each
// field one-to-one (the inverse of Entry.ToMatcherEntry).
func FromMatcherEntry(e matcher.Entry) Entry {
	var d Entry
	d.MkvOffset = e.MkvOffset
	d.Length = e.Length
	d.Source = e.Source
	d.SourceOffset = e.SourceOffset
	d.IsVideo = e.IsVideo
	d.AudioSubStreamID = e.AudioSubStreamID
	d.IsLPCM = e.IsLPCM
	return d
}
// ToSourceFile converts source.File to dedup SourceFile. The Used flag is
// left at its zero value (false) here.
func ToSourceFile(sf source.File) SourceFile {
	var out SourceFile
	out.RelativePath = sf.RelativePath
	out.Size = sf.Size
	out.Checksum = sf.Checksum
	return out
}
package dedup
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"sort"
"sync"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/mmap"
"github.com/stuckj/mkvdup/internal/source"
)
// Range map constants
const (
	RangeMapMagic = "RNGEMAPX" // 8 bytes, identifies the range map section
	// rangeMapCoarseStep is how many entries per coarse index slot.
	// Binary search the coarse index, then seek within a block.
	rangeMapCoarseStep = 1024
)
// RangeMapStreamHeader identifies a stream within the range map section.
type RangeMapStreamHeader struct {
	FileIndex   uint16 // Source file index (0-based)
	StreamType  uint8  // 0 = video, 1 = audio
	SubStreamID uint8  // For audio: sub-stream ID
	EntryCount  uint32 // Number of range entries for this stream
}
// rangeMapCoarseEntry is one slot in the coarse ESOffset index.
// It snapshots the decoder state at a given entry so that decoding can
// resume from this point without re-reading the compressed stream from
// the start.
type rangeMapCoarseEntry struct {
	esOffset     int64 // ES offset at the start of this entry
	fileOffset   int64 // raw file offset of this entry
	entryIndex   int   // logical entry index
	entrySize    int   // payload size of this entry
	byteOff      int   // byte offset in compressed data for next decode
	rleRemaining int   // default entries remaining after this one in current RLE run
}
// rangeMapCursor tracks position during sequential access through a compressed range map.
type rangeMapCursor struct {
	esOff   int64 // ES offset at the start of the current entry
	fileOff int64 // raw file offset of the current entry
	size    int   // payload size of the current entry
	rleRem  int   // default entries remaining after this one in the current RLE run
	pos     int   // position in compressed data
}
// StreamRangeMap provides random access to a stream's range map using
// compressed delta+varint+RLE encoded data and a coarse in-memory ESOffset index.
type StreamRangeMap struct {
	compressedData []byte                // compressed range data (zero-copy slice from mmap)
	entryCount     int                   // total number of range entries
	defaultGap     int64                 // RLE default: inter-entry file gap
	defaultSize    int                   // RLE default: entry payload size
	coarse         []rangeMapCoarseEntry // coarse ESOffset index for binary search
	totalSize      int64                 // total ES size (sum of all entry sizes)
	// Sequential read cursor cache — avoids redundant binary search + seeking
	// for reads at or near the previous position (common in FUSE sequential reads).
	// Protected by cursorMu for concurrent FUSE read safety.
	cursorMu          sync.Mutex
	cachedCursor      rangeMapCursor
	cachedCursorValid bool
}
// TotalESSize returns the total size of the elementary stream
// (the sum of all entry payload sizes).
func (sm *StreamRangeMap) TotalESSize() int64 {
	return sm.totalSize
}
// --- Varint / Zigzag helpers ---
func zigzagEncode(v int64) uint64 {
return uint64((v << 1) ^ (v >> 63))
}
func zigzagDecode(v uint64) int64 {
return int64(v>>1) ^ -int64(v&1)
}
// --- Compressed encoding ---

// findDefaultsSampleSize is the maximum number of entries to examine when
// determining the most common gap and size. For typical media streams the
// pattern is consistent throughout, so a small sample is sufficient and
// avoids O(N) map operations on streams with hundreds of millions of entries.
const findDefaultsSampleSize = 10000

// findDefaults finds the most common gap and size in a range sequence.
// Uses sampling for large inputs to avoid expensive map operations.
//
// Ties are broken toward the smaller value so the result (and therefore the
// encoded range map bytes) is deterministic: Go map iteration order is
// randomized, so a plain "first maximum wins" scan could pick different
// defaults on different runs for identical input.
//
// Returns (0, 0) if ranges are too small or values don't fit in uint16.
func findDefaults(ranges []source.PESPayloadRange) (defaultGap int64, defaultSize int) {
	if len(ranges) < 2 {
		if len(ranges) == 1 {
			return 0, ranges[0].Size
		}
		return 0, 0
	}
	// Sample a prefix — patterns in PES streams are consistent throughout.
	sampleLen := len(ranges)
	if sampleLen > findDefaultsSampleSize {
		sampleLen = findDefaultsSampleSize
	}
	// Count gap frequencies (sample only)
	gapCounts := make(map[int64]int)
	for i := 1; i < sampleLen; i++ {
		prevEnd := ranges[i-1].FileOffset + int64(ranges[i-1].Size)
		gapCounts[ranges[i].FileOffset-prevEnd]++
	}
	var bestGap int64
	bestGapCount := 0
	for gap, count := range gapCounts {
		// Deterministic tie-break: prefer the smaller gap.
		if count > bestGapCount || (count == bestGapCount && gap < bestGap) {
			bestGap = gap
			bestGapCount = count
		}
	}
	// Count size frequencies (sample only)
	sizeCounts := make(map[int]int)
	for i := 0; i < sampleLen; i++ {
		sizeCounts[ranges[i].Size]++
	}
	var bestSize int
	bestSizeCount := 0
	for size, count := range sizeCounts {
		// Deterministic tie-break: prefer the smaller size.
		if count > bestSizeCount || (count == bestSizeCount && size < bestSize) {
			bestSize = size
			bestSizeCount = count
		}
	}
	// Clamp to uint16 range for on-disk storage; disable RLE if out of range
	if bestGap < 0 || bestGap > 65535 || bestSize > 65535 {
		return 0, 0
	}
	return bestGap, bestSize
}
// encodeCompressedRanges encodes PES payload ranges using delta+varint+RLE.
//
// Format:
//   - First entry: fileOffset (uvarint) + size (uvarint)
//   - Subsequent entries:
//   - 0x00 + count (uvarint): RLE run of count default entries
//   - (zigzag(delta)+1) (uvarint) + size (uvarint): explicit entry
//
// The +1 shift ensures explicit entries never start with 0x00.
// offsetFunc, if non-nil, converts parser-relative FileOffset values to
// source-file-relative offsets (e.g., adding ISO base offset). The encoded
// data always stores converted offsets.
func encodeCompressedRanges(ranges []source.PESPayloadRange, defaultGap int64, defaultSize int, offsetFunc func(int64) int64) []byte {
	if len(ranges) == 0 {
		return nil
	}
	// Direct []byte append (rather than bytes.Buffer) keeps the hot loop
	// cheap; RLE makes the typical output tiny, so 256 bytes is generous.
	out := make([]byte, 0, 256)
	var scratch [binary.MaxVarintLen64]byte
	putUvarint := func(v uint64) {
		n := binary.PutUvarint(scratch[:], v)
		out = append(out, scratch[:n]...)
	}
	mapOffset := func(off int64) int64 {
		if offsetFunc != nil {
			return offsetFunc(off)
		}
		return off
	}
	// First entry is always explicit: absolute offset + size.
	prevOff := mapOffset(ranges[0].FileOffset)
	putUvarint(uint64(prevOff))
	putUvarint(uint64(ranges[0].Size))
	flushRLE := func(count int) {
		out = append(out, 0x00)
		putUvarint(uint64(count))
	}
	pending := 0 // default entries accumulated in the current RLE run
	for i := 1; i < len(ranges); i++ {
		curOff := mapOffset(ranges[i].FileOffset)
		prevEnd := prevOff + int64(ranges[i-1].Size)
		if curOff-prevEnd == defaultGap && ranges[i].Size == defaultSize {
			pending++
		} else {
			if pending > 0 {
				flushRLE(pending)
				pending = 0
			}
			// Explicit entry: zigzag delta from the predicted offset, +1 so
			// the first byte can never be the 0x00 RLE marker.
			delta := curOff - (prevEnd + defaultGap)
			putUvarint(zigzagEncode(delta) + 1)
			putUvarint(uint64(ranges[i].Size))
		}
		prevOff = curOff
	}
	if pending > 0 {
		flushRLE(pending)
	}
	return out
}
// --- Compressed decoding ---
// buildStreamRangeMap creates a StreamRangeMap from compressed data.
// It decodes the entire stream once to build a coarse ESOffset index
// (one snapshot every rangeMapCoarseStep entries) so later seeks can
// binary search instead of decoding from the start of the stream.
func buildStreamRangeMap(compressedData []byte, entryCount int, defaultGap int64, defaultSize int) (*StreamRangeMap, error) {
	if entryCount == 0 {
		// Empty stream: a valid map with no entries and no coarse index.
		return &StreamRangeMap{entryCount: 0}, nil
	}
	sm := &StreamRangeMap{
		compressedData: compressedData,
		entryCount:     entryCount,
		defaultGap:     defaultGap,
		defaultSize:    defaultSize,
	}
	// Build coarse index by iterating through all entries
	coarseCount := (entryCount + rangeMapCoarseStep - 1) / rangeMapCoarseStep
	sm.coarse = make([]rangeMapCoarseEntry, 0, coarseCount)
	// Decode first entry (always explicit: fileOffset uvarint + size uvarint)
	pos := 0
	fo, n := binary.Uvarint(compressedData[pos:])
	if n <= 0 {
		return nil, fmt.Errorf("truncated first entry fileOffset")
	}
	pos += n
	sz, n := binary.Uvarint(compressedData[pos:])
	if n <= 0 {
		return nil, fmt.Errorf("truncated first entry size")
	}
	pos += n
	// Decoder state: ES offset, raw file offset, and payload size of the
	// current entry, plus default entries left in the active RLE run.
	var esOff int64
	fileOff := int64(fo)
	entSize := int(sz)
	rleRem := 0
	// Record coarse entry for entry 0
	sm.coarse = append(sm.coarse, rangeMapCoarseEntry{
		esOffset: 0, fileOffset: fileOff, entryIndex: 0, entrySize: entSize,
		byteOff: pos, rleRemaining: 0,
	})
	// Iterate through entries 1..entryCount-1
	for i := 1; i < entryCount; i++ {
		prevEnd := fileOff + int64(entSize)
		esOff += int64(entSize)
		if rleRem > 0 {
			// Still in RLE run: next entry is a default entry, nothing to decode
			fileOff = prevEnd + defaultGap
			entSize = defaultSize
			rleRem--
		} else if pos < len(compressedData) && compressedData[pos] == 0x00 {
			// RLE token: 0x00 marker followed by the run length (uvarint)
			pos++
			count, n := binary.Uvarint(compressedData[pos:])
			if n <= 0 {
				return nil, fmt.Errorf("truncated RLE count at entry %d", i)
			}
			pos += n
			fileOff = prevEnd + defaultGap
			entSize = defaultSize
			// This entry consumes one of the run; the rest follow.
			rleRem = int(count) - 1
		} else if pos < len(compressedData) {
			// Explicit entry: (zigzag(delta)+1) uvarint + size uvarint, where
			// delta is relative to the default-gap prediction prevEnd+defaultGap
			encoded, n := binary.Uvarint(compressedData[pos:])
			if n <= 0 {
				return nil, fmt.Errorf("truncated explicit delta at entry %d", i)
			}
			pos += n
			szv, n := binary.Uvarint(compressedData[pos:])
			if n <= 0 {
				return nil, fmt.Errorf("truncated explicit size at entry %d", i)
			}
			pos += n
			delta := zigzagDecode(encoded - 1)
			fileOff = prevEnd + defaultGap + delta
			entSize = int(szv)
			rleRem = 0
		} else {
			return nil, fmt.Errorf("unexpected end of compressed data at entry %d", i)
		}
		// Snapshot the decoder state every rangeMapCoarseStep entries so
		// seeks can resume decoding from the nearest preceding slot.
		if i%rangeMapCoarseStep == 0 {
			sm.coarse = append(sm.coarse, rangeMapCoarseEntry{
				esOffset: esOff, fileOffset: fileOff, entryIndex: i, entrySize: entSize,
				byteOff: pos, rleRemaining: rleRem,
			})
		}
	}
	// Total ES size = start of the last entry plus its payload size.
	sm.totalSize = esOff + int64(entSize)
	return sm, nil
}
// advanceCursor moves the cursor forward by one entry, decoding the next
// RLE token or explicit entry from the compressed stream when needed.
func (sm *StreamRangeMap) advanceCursor(c *rangeMapCursor) error {
	// End of the current entry, in both address spaces.
	prevEnd := c.fileOff + int64(c.size)
	c.esOff += int64(c.size)
	// Inside an RLE run: the next entry is a default entry, nothing to decode.
	if c.rleRem > 0 {
		c.fileOff = prevEnd + sm.defaultGap
		c.size = sm.defaultSize
		c.rleRem--
		return nil
	}
	if c.pos >= len(sm.compressedData) {
		return fmt.Errorf("unexpected end of compressed data")
	}
	if sm.compressedData[c.pos] != 0x00 {
		// Explicit entry: (zigzag(delta)+1) uvarint followed by size uvarint.
		encoded, n := binary.Uvarint(sm.compressedData[c.pos:])
		if n <= 0 {
			return fmt.Errorf("truncated explicit delta")
		}
		c.pos += n
		sz, n := binary.Uvarint(sm.compressedData[c.pos:])
		if n <= 0 {
			return fmt.Errorf("truncated explicit size")
		}
		c.pos += n
		c.fileOff = prevEnd + sm.defaultGap + zigzagDecode(encoded-1)
		c.size = int(sz)
		c.rleRem = 0
		return nil
	}
	// 0x00 marker: start of an RLE run of default entries.
	c.pos++
	count, n := binary.Uvarint(sm.compressedData[c.pos:])
	if n <= 0 {
		return fmt.Errorf("truncated RLE count")
	}
	c.pos += n
	c.fileOff = prevEnd + sm.defaultGap
	c.size = sm.defaultSize
	// This entry consumes one of the run; the remainder follow.
	c.rleRem = int(count) - 1
	return nil
}
// skipForwardTo advances cur entry-by-entry until it reaches the entry
// containing esOffset, using O(1) arithmetic to jump within RLE runs of
// default-sized entries instead of decoding them one at a time.
//
// This helper replaces two previously duplicated copies of the same loop
// (the cached-cursor fast path and the coarse-index slow path). The original
// "target inside run" and "target past run" branches collapse into one:
// clamping k to the run length makes the past-run case (k = rleRem,
// rleRem -> 0) fall out of the same arithmetic.
func (sm *StreamRangeMap) skipForwardTo(cur *rangeMapCursor, esOffset int64) error {
	for cur.esOff+int64(cur.size) <= esOffset {
		// RLE fast path: jump within a run of default-sized entries.
		if cur.rleRem > 0 && sm.defaultSize > 0 {
			afterCurrent := cur.esOff + int64(cur.size)
			// k = run entries to skip (1-based), clamped to the run length.
			// If the target lies beyond the run this lands on its last entry
			// and the loop falls through to advanceCursor below.
			k := int((esOffset-afterCurrent)/int64(sm.defaultSize)) + 1
			if k > cur.rleRem {
				k = cur.rleRem
			}
			stride := int64(sm.defaultSize) + sm.defaultGap
			// k-1 positions relative to afterCurrent (start of run entry 1).
			cur.esOff = afterCurrent + int64(k-1)*int64(sm.defaultSize)
			cur.fileOff = cur.fileOff + int64(cur.size) + sm.defaultGap + int64(k-1)*stride
			cur.size = sm.defaultSize
			cur.rleRem -= k
			continue
		}
		if err := sm.advanceCursor(cur); err != nil {
			return fmt.Errorf("seek to ES offset %d: %w", esOffset, err)
		}
	}
	return nil
}

// seekTo positions a cursor at the entry containing esOffset.
// Uses the cached cursor for O(1) sequential reads, falling back to
// coarse index binary search + forward scan for random access.
func (sm *StreamRangeMap) seekTo(esOffset int64) (rangeMapCursor, error) {
	// Snapshot the cached cursor under the lock for a consistent view.
	sm.cursorMu.Lock()
	cc := sm.cachedCursor
	cachedValid := sm.cachedCursorValid
	sm.cursorMu.Unlock()
	if cachedValid {
		curEnd := cc.esOff + int64(cc.size)
		if esOffset >= cc.esOff && esOffset < curEnd {
			// Target is within the cached entry — use it directly.
			return cc, nil
		}
		if esOffset >= curEnd {
			// Target is ahead of the cached cursor — scan forward only when
			// it is "close" (within ~2 coarse blocks); otherwise the coarse
			// index below is cheaper.
			maxForwardSeek := int64(rangeMapCoarseStep*2) * int64(sm.defaultSize+1)
			if maxForwardSeek > 0 && esOffset-curEnd < maxForwardSeek {
				cur := cc
				if err := sm.skipForwardTo(&cur, esOffset); err != nil {
					return rangeMapCursor{}, err
				}
				return cur, nil
			}
		}
	}
	// Slow path: binary search the coarse index for the last slot starting
	// at or before esOffset, then scan forward from its snapshot.
	blockIdx := sort.Search(len(sm.coarse), func(i int) bool {
		return sm.coarse[i].esOffset > esOffset
	}) - 1
	if blockIdx < 0 {
		blockIdx = 0
	}
	ce := &sm.coarse[blockIdx]
	cur := rangeMapCursor{
		esOff:   ce.esOffset,
		fileOff: ce.fileOffset,
		size:    ce.entrySize,
		rleRem:  ce.rleRemaining,
		pos:     ce.byteOff,
	}
	if err := sm.skipForwardTo(&cur, esOffset); err != nil {
		return rangeMapCursor{}, err
	}
	return cur, nil
}
// ReadData reads size bytes of ES data starting at esOffset, copying into a
// newly allocated buffer. sourceData is the raw source file contents
// (sourceSize is its length); all reads are bounds-checked against it.
// Uses the coarse index for fast binary search, RLE arithmetic for fast seeking.
func (sm *StreamRangeMap) ReadData(sourceData []byte, sourceSize int64, esOffset int64, size int) ([]byte, error) {
	if sm.entryCount == 0 {
		return nil, fmt.Errorf("empty range map")
	}
	cur, err := sm.seekTo(esOffset)
	if err != nil {
		return nil, err
	}
	// Read data, potentially spanning multiple entries
	result := make([]byte, 0, size)
	remaining := size
	for remaining > 0 {
		// Position of the read pointer within the current entry.
		offsetInEntry := esOffset - cur.esOff
		if offsetInEntry < 0 {
			return nil, fmt.Errorf("ES offset gap at ES %d", cur.esOff)
		}
		// Copy as much as the current entry provides, capped by remaining.
		available := int64(cur.size) - offsetInEntry
		toRead := int64(remaining)
		if toRead > available {
			toRead = available
		}
		srcStart := cur.fileOff + offsetInEntry
		srcEnd := srcStart + toRead
		if srcEnd > sourceSize {
			return nil, fmt.Errorf("source read out of bounds: %d + %d > %d", srcStart, toRead, sourceSize)
		}
		result = append(result, sourceData[srcStart:srcEnd]...)
		remaining -= int(toRead)
		esOffset += toRead
		if remaining > 0 {
			// RLE batch path: batch-copy full entries using stride arithmetic.
			if cur.rleRem > 0 {
				// Advance to the next RLE entry (equivalent to one advanceCursor).
				cur.esOff += int64(cur.size)
				cur.fileOff += int64(cur.size) + sm.defaultGap
				cur.size = sm.defaultSize
				cur.rleRem--
				stride := int64(sm.defaultSize) + sm.defaultGap
				defSz := sm.defaultSize
				defSz64 := int64(defSz)
				// Calculate how many full entries we can batch-copy:
				// bounded by the bytes still wanted and by the run length
				// (the current entry plus cur.rleRem following defaults).
				batchCount := remaining / defSz
				if maxRLE := cur.rleRem + 1; batchCount > maxRLE {
					batchCount = maxRLE
				}
				if batchCount > 0 {
					// Bounds check the whole batch up front.
					lastSrcEnd := cur.fileOff + int64(batchCount-1)*stride + defSz64
					if lastSrcEnd > sourceSize {
						return nil, fmt.Errorf("source read out of bounds: %d > %d",
							lastSrcEnd, sourceSize)
					}
					// Extend result within its preallocated capacity (cap is
					// size and total bytes appended never exceed size).
					off := len(result)
					result = result[:off+batchCount*defSz]
					stridedCopy(
						result[off:off+batchCount*defSz],
						sourceData[cur.fileOff:lastSrcEnd],
						batchCount, defSz, int(stride),
					)
					copied := batchCount * defSz
					remaining -= copied
					esOffset += int64(copied)
					// Position cursor at the last copied entry.
					if batchCount > 1 {
						advance := batchCount - 1
						cur.esOff += int64(advance) * defSz64
						cur.fileOff += int64(advance) * stride
						cur.rleRem -= advance
					}
				}
				continue
			}
			if err := sm.advanceCursor(&cur); err != nil {
				return nil, fmt.Errorf("read spanning entries: %w", err)
			}
		}
	}
	// Update cached cursor for next sequential read
	sm.cursorMu.Lock()
	sm.cachedCursor = cur
	sm.cachedCursorValid = true
	sm.cursorMu.Unlock()
	return result, nil
}
// ReadDataInto reads ES data at the given offset directly into dest, avoiding allocation.
// Returns the number of bytes written. Uses cached cursor for sequential reads.
//
// The source parameter provides read access to the source file. If source
// implements MmapData, the mmap'd byte slice is used for zero-copy reads.
// Otherwise, source.ReadAt is used (pread path for network filesystems).
//
// A read may span multiple range-map entries: each iteration copies the
// overlap with the current entry, then advances the cursor. When the cursor
// sits inside an RLE run of default-sized entries, whole entries are
// batch-copied with a single strided copy instead of one copy per entry.
func (sm *StreamRangeMap) ReadDataInto(source mmap.SourceFile, esOffset int64, dest []byte) (int, error) {
	if sm.entryCount == 0 {
		return 0, fmt.Errorf("empty range map")
	}
	sourceSize := source.Size()
	// Resolve mmap data once for the zero-copy fast path.
	var sourceData []byte
	if md, ok := source.(mmap.MmapData); ok {
		sourceData = md.Data()
	}
	// Position a cursor at the entry containing esOffset (seekTo reuses the
	// cached cursor when the read continues a previous one).
	cur, err := sm.seekTo(esOffset)
	if err != nil {
		return 0, err
	}
	// Read data directly into dest, potentially spanning multiple entries
	written := 0
	remaining := len(dest)
	for remaining > 0 {
		offsetInEntry := esOffset - cur.esOff
		if offsetInEntry < 0 {
			// Requested offset falls before the current entry: the map has a
			// gap in its ES offset coverage.
			return written, fmt.Errorf("ES offset gap at ES %d", cur.esOff)
		}
		// Copy at most the bytes left in the current entry.
		available := int64(cur.size) - offsetInEntry
		toRead := int64(remaining)
		if toRead > available {
			toRead = available
		}
		srcStart := cur.fileOff + offsetInEntry
		srcEnd := srcStart + toRead
		if srcEnd > sourceSize {
			return written, fmt.Errorf("source read out of bounds: %d + %d > %d", srcStart, toRead, sourceSize)
		}
		if sourceData != nil {
			// Zero-copy path: copy straight out of the mmap'd region.
			copy(dest[written:], sourceData[srcStart:srcEnd])
		} else {
			// Pread path. io.EOF is tolerated only when the read was complete.
			if n, err := source.ReadAt(dest[written:written+int(toRead)], srcStart); err != nil && !(n == int(toRead) && err == io.EOF) {
				return written, fmt.Errorf("pread at %d: %w", srcStart, err)
			}
		}
		written += int(toRead)
		remaining -= int(toRead)
		esOffset += toRead
		if remaining > 0 {
			// RLE batch path: when the next entries are in an RLE run,
			// batch-copy full entries using a single strided copy instead of
			// calling copy()/advanceCursor per entry.
			if cur.rleRem > 0 {
				// Advance to next RLE entry (equivalent to one advanceCursor)
				cur.esOff += int64(cur.size)
				cur.fileOff += int64(cur.size) + sm.defaultGap
				cur.size = sm.defaultSize
				cur.rleRem--
				stride := int64(sm.defaultSize) + sm.defaultGap
				defSz := sm.defaultSize
				defSz64 := int64(defSz)
				// Calculate how many full entries we can batch-copy: limited
				// both by the bytes still wanted and by the entries left in
				// this RLE run (current entry included, hence rleRem+1).
				batchCount := remaining / defSz
				if maxRLE := cur.rleRem + 1; batchCount > maxRLE {
					batchCount = maxRLE
				}
				if batchCount > 0 {
					// Bounds check the entire batch
					lastSrcEnd := cur.fileOff + int64(batchCount-1)*stride + defSz64
					if lastSrcEnd > sourceSize {
						return written, fmt.Errorf("source read out of bounds: %d > %d",
							lastSrcEnd, sourceSize)
					}
					if sourceData != nil {
						stridedCopy(
							dest[written:written+batchCount*defSz],
							sourceData[cur.fileOff:lastSrcEnd],
							batchCount, defSz, int(stride),
						)
					} else {
						// Pread path: read the contiguous source region into a
						// temp buffer, then strided-copy into dest.
						// tmpSize is bounded by ~len(dest) * stride/defSz, which
						// for Blu-ray M2TS (192/188) is ≈1.02× the dest buffer.
						// Since dest comes from a FUSE read (typically 128KB, max
						// ~1MB), this allocation is small and short-lived. If
						// profiling shows GC pressure, consider a sync.Pool here.
						tmpSize := int(lastSrcEnd - cur.fileOff)
						tmp := make([]byte, tmpSize)
						if n, err := source.ReadAt(tmp, cur.fileOff); err != nil && !(n == tmpSize && err == io.EOF) {
							return written, fmt.Errorf("pread batch at %d: %w", cur.fileOff, err)
						}
						stridedCopy(
							dest[written:written+batchCount*defSz],
							tmp,
							batchCount, defSz, int(stride),
						)
					}
					copied := batchCount * defSz
					written += copied
					remaining -= copied
					esOffset += int64(copied)
					// Position cursor at the last copied entry
					if batchCount > 1 {
						advance := batchCount - 1
						cur.esOff += int64(advance) * defSz64
						cur.fileOff += int64(advance) * stride
						cur.rleRem -= advance
					}
				}
				continue
			}
			if err := sm.advanceCursor(&cur); err != nil {
				return written, fmt.Errorf("read spanning entries: %w", err)
			}
		}
	}
	// Update cached cursor for next sequential read
	sm.cursorMu.Lock()
	sm.cachedCursor = cur
	sm.cachedCursorValid = true
	sm.cursorMu.Unlock()
	return written, nil
}
// --- Deserialization (for Reader) ---
// SourceRangeMaps holds parsed range maps for one source file.
type SourceRangeMaps struct {
	FileIndex uint16                       // index of the source file within the dedup file's source table
	VideoMap  *StreamRangeMap              // range map for the video stream; nil if the source has none
	AudioMaps map[byte]*StreamRangeMap     // keyed by sub-stream ID
}
// readRangeMapSection parses the range map section from mmap'd data.
// The data slice should point to the start of the range map section.
// Compressed data is zero-copy sliced from the input.
//
// Wire layout (all little-endian): magic (8 bytes), source count (uint16),
// then per source: FileIndex (uint16), stream count (uint8), followed by one
// record per stream: 8-byte stream header, 8 bytes of compression parameters,
// then the compressed range data. Every read is bounds-checked against
// len(data) before advancing, so truncated input yields an error rather than
// a panic.
func readRangeMapSection(data []byte) ([]SourceRangeMaps, error) {
	if len(data) < 10 { // magic (8) + source count (2)
		return nil, fmt.Errorf("range map section too small: %d bytes", len(data))
	}
	// Verify magic
	if string(data[:8]) != RangeMapMagic {
		return nil, fmt.Errorf("invalid range map magic: %q", data[:8])
	}
	off := 8
	// Source count
	sourceCount := int(binary.LittleEndian.Uint16(data[off : off+2]))
	off += 2
	result := make([]SourceRangeMaps, 0, sourceCount)
	for s := 0; s < sourceCount; s++ {
		if off+3 > len(data) { // FileIndex (2) + StreamCount (1)
			return nil, fmt.Errorf("truncated range map at source %d", s)
		}
		fileIndex := binary.LittleEndian.Uint16(data[off : off+2])
		off += 2
		streamCount := int(data[off])
		off++
		src := SourceRangeMaps{
			FileIndex: fileIndex,
			AudioMaps: make(map[byte]*StreamRangeMap),
		}
		for st := 0; st < streamCount; st++ {
			if off+8 > len(data) { // StreamHeader size
				return nil, fmt.Errorf("truncated stream header at source %d stream %d", s, st)
			}
			// Parse stream header
			var hdr RangeMapStreamHeader
			_ = binary.LittleEndian.Uint16(data[off : off+2]) // per-stream FileIndex (already tracked per source)
			hdr.StreamType = data[off+2]
			hdr.SubStreamID = data[off+3]
			hdr.EntryCount = binary.LittleEndian.Uint32(data[off+4 : off+8])
			off += 8
			// Read compression parameters
			if off+8 > len(data) { // DefaultGap(2) + DefaultSize(2) + CompressedDataSize(4)
				return nil, fmt.Errorf("truncated compression params at source %d stream %d", s, st)
			}
			defGap := int64(binary.LittleEndian.Uint16(data[off : off+2]))
			off += 2
			defSize := int(binary.LittleEndian.Uint16(data[off : off+2]))
			off += 2
			compSize := int(binary.LittleEndian.Uint32(data[off : off+4]))
			off += 4
			if off+compSize > len(data) {
				return nil, fmt.Errorf("truncated compressed data at source %d stream %d: need %d bytes at offset %d, have %d total",
					s, st, compSize, off, len(data))
			}
			// Zero-copy slice into mmap'd data
			compData := data[off : off+compSize]
			off += compSize
			sm, err := buildStreamRangeMap(compData, int(hdr.EntryCount), defGap, defSize)
			if err != nil {
				return nil, fmt.Errorf("build range map for source %d stream %d: %w", s, st, err)
			}
			// StreamType 0 is video; anything else is an audio sub-stream.
			if hdr.StreamType == 0 {
				src.VideoMap = sm
			} else {
				src.AudioMaps[hdr.SubStreamID] = sm
			}
		}
		result = append(result, src)
	}
	return result, nil
}
// --- Serialization (for Writer) ---
// RangeMapData holds the range map data for all streams of one source file,
// ready for serialization into the dedup file.
type RangeMapData struct {
	FileIndex    uint16                   // index of the source file within the dedup file's source table
	VideoRanges  []source.PESPayloadRange // video stream payload ranges; empty if none
	AudioStreams []AudioRangeData         // one element per audio sub-stream
	OffsetFunc   func(int64) int64        // optional: converts parser-relative to source-file-relative FileOffset
}
// AudioRangeData holds range data for one audio sub-stream.
type AudioRangeData struct {
	SubStreamID byte                     // identifies the audio sub-stream within the source
	Ranges      []source.PESPayloadRange // payload ranges for this sub-stream
}
// encodeRangeMapSection serializes every source's stream range maps into a
// single byte buffer. Building the full section in memory first lets the
// writer know its exact size and compute the checksum before writing.
// Layout: magic (8 bytes), source count (uint16 LE), then per-source records
// (FileIndex, stream count, followed by each compressed stream).
func encodeRangeMapSection(rangeMaps []RangeMapData) ([]byte, error) {
	var out bytes.Buffer
	var scratch [8]byte
	// Section magic followed by the number of source files.
	out.Write([]byte(RangeMapMagic))
	binary.LittleEndian.PutUint16(scratch[:2], uint16(len(rangeMaps)))
	out.Write(scratch[:2])
	for _, rm := range rangeMaps {
		// A source contributes one video stream (if present) plus all of its
		// audio sub-streams.
		count := uint8(len(rm.AudioStreams))
		if len(rm.VideoRanges) > 0 {
			count++
		}
		binary.LittleEndian.PutUint16(scratch[:2], rm.FileIndex)
		out.Write(scratch[:2])
		out.WriteByte(count)
		// Video stream (StreamType 0) comes first, then audio (StreamType 1).
		if len(rm.VideoRanges) > 0 {
			writeCompressedStream(&out, rm.FileIndex, 0, 0, rm.VideoRanges, rm.OffsetFunc)
		}
		for _, a := range rm.AudioStreams {
			writeCompressedStream(&out, rm.FileIndex, 1, a.SubStreamID, a.Ranges, rm.OffsetFunc)
		}
	}
	return out.Bytes(), nil
}
// writeCompressedStream appends one stream's header, compression parameters,
// and compressed range data to buf. offsetFunc, when non-nil, rewrites
// parser-relative FileOffset values into source-file-relative offsets as the
// ranges are encoded.
func writeCompressedStream(buf *bytes.Buffer, fileIndex uint16, streamType uint8, subStreamID byte, ranges []source.PESPayloadRange, offsetFunc func(int64) int64) {
	var scratch [16]byte
	// Stream header: FileIndex(2) + StreamType(1) + SubStreamID(1) + EntryCount(4) = 8 bytes.
	binary.LittleEndian.PutUint16(scratch[0:2], fileIndex)
	scratch[2] = streamType
	scratch[3] = subStreamID
	binary.LittleEndian.PutUint32(scratch[4:8], uint32(len(ranges)))
	buf.Write(scratch[:8])
	// Mode gap/size defaults drive the RLE compression. They are computed on
	// parser-relative offsets — gaps are the same in both domains for the
	// common non-boundary case, which dominates the mode calculation.
	defGap, defSize := findDefaults(ranges)
	// Encode compressed ranges (applies offsetFunc during encoding).
	compressed := encodeCompressedRanges(ranges, defGap, defSize, offsetFunc)
	// Compression parameters: DefaultGap(2) + DefaultSize(2) + CompressedDataSize(4) = 8 bytes.
	binary.LittleEndian.PutUint16(scratch[0:2], uint16(defGap))
	binary.LittleEndian.PutUint16(scratch[2:4], uint16(defSize))
	binary.LittleEndian.PutUint32(scratch[4:8], uint32(len(compressed)))
	buf.Write(scratch[:8])
	buf.Write(compressed)
}
// writeRangeMapSection writes a pre-encoded range map buffer to w and returns
// the xxhash64 checksum of that buffer. The writer stores the checksum so
// readers can detect corruption of the range map section.
func writeRangeMapSection(w io.Writer, rangeMapBuf []byte) (uint64, error) {
	sum := xxhash.Sum64(rangeMapBuf)
	if _, err := w.Write(rangeMapBuf); err != nil {
		return 0, err
	}
	return sum, nil
}
package dedup
import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"sort"
	"sync"
	"time"

	"github.com/cespare/xxhash/v2"
	"github.com/stuckj/mkvdup/internal/mmap"
	"github.com/stuckj/mkvdup/internal/security"
	"github.com/stuckj/mkvdup/internal/source"
	"golang.org/x/sys/unix"
)
// blockSize is the block size for the block index.
// Each block maps an MKV offset range to an entry index for O(1) lookup.
// 64KB balances memory overhead vs scan distance.
const blockSize = 64 * 1024
// Reader reads .mkvdup files and provides data reconstruction.
// Reader is safe for concurrent use from multiple goroutines.
type Reader struct {
	file        *File             // parsed header + source file table
	dedupMmap   *mmap.File        // mmap of the dedup file itself (entries, delta, range maps)
	dedupPath   string            // path the dedup file was opened from
	sourceDir   string            // root directory that source paths are confined to
	sourceFiles []mmap.SourceFile // opened source files, indexed to match file.SourceFiles
	esReader    ESReader          // For ES-based sources (v1 only, deprecated in v2)
	entriesOnce sync.Once         // For lazy entry access initialization
	entriesErr  error             // Error from entry access initialization
	// Direct mmap access to entries (no []Entry allocation)
	indexStart int64 // Byte offset where entries begin in file
	entryCount int   // Number of entries
	// Block index for fast entry lookup on cache miss.
	// Maps block_number (MKV offset / blockSize) → entry index for O(1)
	// narrowing, followed by bounded binary search within the block range.
	// Built once in initEntryAccess; immutable after that (no mutex needed).
	blockIndex []int
	// Last-entry cache for O(1) sequential read lookup
	// Protected by cacheMu for concurrent access safety
	cacheMu        sync.Mutex
	lastEntryIdx   int   // Index of last accessed entry (-1 if none)
	lastEntry      Entry // The cached parsed entry
	lastEntryValid bool  // Whether lastEntry is valid
	// V4 range map data (maps ES offsets to raw file offsets)
	rangeMapsByFile map[int]*SourceRangeMaps // file index -> range maps
}
// ESReader interface for reading ES data from MPEG-PS sources.
// Implementations resolve elementary-stream offsets to raw bytes; used only
// for v1 dedup files (UsesESOffsets), deprecated in later versions.
type ESReader interface {
	// ReadESData reads size bytes at esOffset from the video or audio stream.
	ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error)
	// ReadAudioSubStreamData reads size bytes at esOffset from one audio sub-stream.
	ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error)
}
// NewReader opens a dedup file for reading with entry access initialized
// eagerly, so the first ReadAt does no extra setup work. Prefer NewReaderLazy
// when mounting many files and startup latency matters more.
func NewReader(dedupPath, sourceDir string) (*Reader, error) {
	rdr, err := NewReaderLazy(dedupPath, sourceDir)
	if err != nil {
		return nil, err
	}
	// Run the (normally lazy) entry-access setup now so errors surface here
	// instead of on the first read.
	if initErr := rdr.initEntryAccess(); initErr != nil {
		rdr.Close()
		return nil, fmt.Errorf("init entry access: %w", initErr)
	}
	return rdr, nil
}
// NewReaderLazy opens a dedup file, parses only its header, and memory-maps
// the whole file. Index entries are loaded lazily on first read, which keeps
// mount times fast when many files are opened.
func NewReaderLazy(dedupPath, sourceDir string) (*Reader, error) {
	fh, err := os.Open(dedupPath)
	if err != nil {
		return nil, fmt.Errorf("open dedup file: %w", err)
	}
	defer fh.Close()
	parsed, err := parseHeaderOnly(fh)
	if err != nil {
		return nil, fmt.Errorf("parse dedup header: %w", err)
	}
	// The mmap provides zero-copy access to the entry index and delta section.
	m, err := mmap.Open(dedupPath)
	if err != nil {
		return nil, fmt.Errorf("mmap dedup file: %w", err)
	}
	rdr := &Reader{
		file:         parsed,
		dedupMmap:    m,
		dedupPath:    dedupPath,
		sourceDir:    sourceDir,
		lastEntryIdx: -1, // nothing cached yet
	}
	return rdr, nil
}
// SetESReader sets the ES reader for ES-based sources.
// Only needed for v1 dedup files that use ES offsets; must be called before
// the first ReadAt if such a reader is required.
func (r *Reader) SetESReader(esReader ESReader) {
	r.esReader = esReader
}
// LoadSourceFiles memory-maps every source file referenced by the dedup file.
// Each relative path is confined to sourceDir to prevent traversal outside
// it. On any failure, files opened so far are closed before returning.
func (r *Reader) LoadSourceFiles() error {
	r.sourceFiles = make([]mmap.SourceFile, len(r.file.SourceFiles))
	// closeOpened releases the first n source files on the error paths.
	closeOpened := func(n int) {
		for j := 0; j < n; j++ {
			if r.sourceFiles[j] != nil {
				r.sourceFiles[j].Close()
			}
		}
	}
	for i, sf := range r.file.SourceFiles {
		path, err := security.CheckPathConfinement(r.sourceDir, sf.RelativePath)
		if err != nil {
			closeOpened(i)
			return fmt.Errorf("source file %s: %w", sf.RelativePath, err)
		}
		m, err := mmap.Open(path)
		if err != nil {
			closeOpened(i)
			return fmt.Errorf("mmap source file %s: %w", sf.RelativePath, err)
		}
		// Hint sequential access so the kernel does aggressive readahead
		// instead of handling individual 4KB page faults.
		m.Advise(unix.MADV_SEQUENTIAL)
		r.sourceFiles[i] = m
	}
	return nil
}
// LoadSourceFilesPread opens all source files using pread(2) instead of mmap.
// This is used for source files on network filesystems where mmap is unsafe.
// Paths are confined to sourceDir; on any failure, files opened so far are
// closed before returning.
func (r *Reader) LoadSourceFilesPread(timeout time.Duration) error {
	r.sourceFiles = make([]mmap.SourceFile, len(r.file.SourceFiles))
	// closeOpened releases the first n source files on the error paths.
	closeOpened := func(n int) {
		for j := 0; j < n; j++ {
			if r.sourceFiles[j] != nil {
				r.sourceFiles[j].Close()
			}
		}
	}
	for i, sf := range r.file.SourceFiles {
		path, err := security.CheckPathConfinement(r.sourceDir, sf.RelativePath)
		if err != nil {
			closeOpened(i)
			return fmt.Errorf("source file %s: %w", sf.RelativePath, err)
		}
		pf, err := mmap.OpenPread(path, timeout)
		if err != nil {
			closeOpened(i)
			return fmt.Errorf("open source file %s: %w", sf.RelativePath, err)
		}
		r.sourceFiles[i] = pf
	}
	return nil
}
// Close releases all resources held by the Reader: the dedup file mmap and
// every loaded source file. It is safe to call on a partially initialized
// Reader (nil mmap, nil source file slots). Previously all Close errors were
// silently dropped; they are now collected and combined so callers that care
// (e.g. flushing on unmount) can detect release failures.
func (r *Reader) Close() error {
	var errs []error
	if r.dedupMmap != nil {
		if err := r.dedupMmap.Close(); err != nil {
			errs = append(errs, fmt.Errorf("close dedup mmap: %w", err))
		}
	}
	for _, sf := range r.sourceFiles {
		if sf == nil {
			continue
		}
		if err := sf.Close(); err != nil {
			errs = append(errs, fmt.Errorf("close source file: %w", err))
		}
	}
	// errors.Join returns nil when errs is empty, preserving the old
	// success-path behavior.
	return errors.Join(errs...)
}
// initEntryAccess lazily sets up direct mmap access to the entry index (no
// parsing into []Entry). It runs at most once via sync.Once; later calls
// return the stored result. Also builds the block index and, for V4/V6/V8
// files, parses the range map section.
func (r *Reader) initEntryAccess() error {
	r.entriesOnce.Do(func() {
		// Entries begin right after the header and the source file table.
		r.indexStart = r.file.headerSize + r.calculateSourceFilesSize()
		r.entryCount = int(r.file.Header.EntryCount)
		// Every entry must fit inside the mmap'd file.
		need := r.indexStart + int64(r.entryCount)*EntrySize
		if int64(r.dedupMmap.Size()) < need {
			r.entriesErr = fmt.Errorf("mmap too small: need %d, have %d",
				need, r.dedupMmap.Size())
			return
		}
		// Block index accelerates random-access entry lookup.
		r.buildBlockIndex()
		// V4/V6/V8: parse range map section.
		if !r.hasRangeMaps() {
			return
		}
		if err := r.initRangeMaps(); err != nil {
			r.entriesErr = fmt.Errorf("init range maps: %w", err)
		}
	})
	return r.entriesErr
}
// initRangeMaps locates and parses the range map section, which sits between
// the delta section and the footer, and populates rangeMapsByFile keyed by
// source file index.
func (r *Reader) initRangeMaps() error {
	start := r.file.DeltaOffset + r.file.Header.DeltaSize
	total := r.dedupMmap.Size()
	size := int(total) - FooterV4Size - int(start)
	if size <= 0 {
		return fmt.Errorf("no range map section found (offset %d, file size %d)", start, total)
	}
	raw := r.dedupMmap.Slice(start, size)
	if raw == nil {
		return fmt.Errorf("range map slice out of bounds")
	}
	parsed, err := readRangeMapSection(raw)
	if err != nil {
		return fmt.Errorf("parse range map section: %w", err)
	}
	r.rangeMapsByFile = make(map[int]*SourceRangeMaps, len(parsed))
	for i := range parsed {
		// Index by position so we take the address of the slice element.
		r.rangeMapsByFile[int(parsed[i].FileIndex)] = &parsed[i]
	}
	return nil
}
// hasRangeMaps reports whether the header version indicates a range-map based
// layout (V4/V6/V8).
func (r *Reader) hasRangeMaps() bool {
	v := r.file.Header.Version
	return v == VersionRangeMap || v == VersionRangeMapCreator || v == VersionRangeMapUsed
}
// HasRangeMaps returns true if this dedup file uses V4/V6/V8 range maps.
// This checks the header version (available immediately after NewReaderLazy)
// rather than the lazily-loaded range map data, so it's safe to call
// before the first ReadAt.
func (r *Reader) HasRangeMaps() bool {
	return r.hasRangeMaps()
}
// HasSourceUsedFlags returns true if the dedup file carries per-source-file
// Used flags (V7+ layouts).
func (r *Reader) HasSourceUsedFlags() bool {
	v := r.file.Header.Version
	return v == VersionUsed || v == VersionRangeMapUsed
}
// buildBlockIndex precomputes, for each fixed-size block of MKV offsets, the
// index of the last entry whose MkvOffset is <= the block's start offset.
// A lookup can then binary-search only one block's entry span instead of the
// whole index.
//
// Algorithm: single pass over all entries, filling block slots as we go.
// Time: O(entryCount + blockCount), Space: O(blockCount).
func (r *Reader) buildBlockIndex() {
	total := r.file.Header.OriginalSize
	if total <= 0 || r.entryCount == 0 {
		return
	}
	numBlocks := int((total + blockSize - 1) / blockSize)
	index := make([]int, numBlocks)
	e := 0
	for b := 0; b < numBlocks; b++ {
		start := int64(b) * blockSize
		// Walk e forward to the last entry starting at or before this block.
		// For block 0 (start=0) it stays at 0, since no entry precedes it.
		for e+1 < r.entryCount {
			next, ok := r.getMkvOffset(e + 1)
			if !ok || next > start {
				break
			}
			e++
		}
		index[b] = e
	}
	r.blockIndex = index
}
// getEntry returns the entry at the given index by parsing from mmap.
// Uses cache for O(1) sequential access. Safe for concurrent use.
func (r *Reader) getEntry(idx int) (Entry, bool) {
	if idx < 0 || idx >= r.entryCount {
		return Entry{}, false
	}
	// Check cache first (with lock)
	r.cacheMu.Lock()
	if r.lastEntryValid && r.lastEntryIdx == idx {
		// Copy out under the lock before releasing it.
		entry := r.lastEntry
		r.cacheMu.Unlock()
		return entry, true
	}
	r.cacheMu.Unlock()
	// Parse entry from mmap using RawEntry (no lock needed - mmap is read-only)
	offset := r.indexStart + int64(idx)*EntrySize
	data := r.dedupMmap.Slice(offset, EntrySize)
	if len(data) < EntrySize {
		return Entry{}, false
	}
	// Parse using RawEntry for portable unaligned access
	// Layout: MkvOffset(8) + Length(8) + Source(2) + SourceOffset(8) + ESFlags(1) + AudioSubStreamID(1) = 28
	var raw RawEntry
	copy(raw.MkvOffset[:], data[0:8])
	copy(raw.Length[:], data[8:16])
	copy(raw.Source[:], data[16:18])
	copy(raw.SourceOffset[:], data[18:26])
	raw.ESFlags = data[26]
	raw.AudioSubStreamID = data[27]
	entry := raw.ToEntry()
	// Update cache (with lock). Concurrent callers may race to set the cache;
	// either result is a valid (idx, entry) pair, so last writer wins safely.
	r.cacheMu.Lock()
	r.lastEntryIdx = idx
	r.lastEntry = entry
	r.lastEntryValid = true
	r.cacheMu.Unlock()
	return entry, true
}
// getMkvOffset reads only the MkvOffset field (first 8 bytes) of the entry at
// idx, avoiding a full entry parse during binary search.
func (r *Reader) getMkvOffset(idx int) (int64, bool) {
	if idx < 0 || idx >= r.entryCount {
		return 0, false
	}
	raw := r.dedupMmap.Slice(r.indexStart+int64(idx)*EntrySize, 8)
	if len(raw) < 8 {
		return 0, false
	}
	return int64(binary.LittleEndian.Uint64(raw)), true
}
// getEntryLength reads only the Length field of the entry at idx (8 bytes at
// offset 8, right after MkvOffset), avoiding a full entry parse during
// binary search.
func (r *Reader) getEntryLength(idx int) (int64, bool) {
	if idx < 0 || idx >= r.entryCount {
		return 0, false
	}
	raw := r.dedupMmap.Slice(r.indexStart+int64(idx)*EntrySize+8, 8)
	if len(raw) < 8 {
		return 0, false
	}
	return int64(binary.LittleEndian.Uint64(raw)), true
}
// OriginalSize returns the size in bytes of the original MKV file, as
// recorded in the dedup file header.
func (r *Reader) OriginalSize() int64 {
	return r.file.Header.OriginalSize
}
// OriginalChecksum returns the checksum of the original MKV file, as recorded
// in the dedup file header.
func (r *Reader) OriginalChecksum() uint64 {
	return r.file.Header.OriginalChecksum
}
// SourceFiles returns the list of source files recorded in the dedup file.
// The returned slice is the Reader's internal copy; callers must not mutate it.
func (r *Reader) SourceFiles() []SourceFile {
	return r.file.SourceFiles
}
// EntryCount returns the number of index entries.
// Returns 0 if entry access initialization failed. Use InitEntryAccess() to check for errors.
func (r *Reader) EntryCount() int {
	r.initEntryAccess() // Ensure entryCount is initialized; error intentionally ignored here
	return r.entryCount
}
// GetEntry returns the entry at the given index, or false if the index is out
// of range or entry access initialization failed.
func (r *Reader) GetEntry(idx int) (Entry, bool) {
	if r.initEntryAccess() != nil {
		return Entry{}, false
	}
	return r.getEntry(idx)
}
// InitEntryAccess explicitly initializes entry access and returns any error.
// This is useful when you need to check for initialization errors before calling
// methods like EntryCount() or Info() that silently return zero/empty on error.
func (r *Reader) InitEntryAccess() error {
	return r.initEntryAccess()
}
// UsesESOffsets returns true if this dedup file uses ES (elementary stream)
// offsets rather than raw file offsets (v1 format only).
func (r *Reader) UsesESOffsets() bool {
	return r.file.UsesESOffsets
}
// ReadAt reads reconstructed MKV data at the given offset.
//
// It walks the entries overlapping [offset, offset+len(buf)), reading each
// entry's overlap from the appropriate backing store (delta section, range
// map, ES reader, or raw source). LPCM entries get an extra byte-swap pass
// because the source stores big-endian PCM. Returns io.EOF when offset is at
// or past the original file size, or when nothing could be read.
func (r *Reader) ReadAt(buf []byte, offset int64) (int, error) {
	// Initialize entry access on first read (lazy initialization)
	if err := r.initEntryAccess(); err != nil {
		return 0, fmt.Errorf("init entry access: %w", err)
	}
	if offset >= r.file.Header.OriginalSize {
		return 0, io.EOF
	}
	totalRead := 0
	remaining := len(buf)
	originalOffset := offset // Preserve original offset for buffer position calculation
	// Limit read to file size
	if offset+int64(remaining) > r.file.Header.OriginalSize {
		remaining = int(r.file.Header.OriginalSize - offset)
	}
	endOffset := offset + int64(remaining)
	// Find starting entry index (zero-allocation inline lookup)
	startIdx := r.findStartEntry(offset)
	// Iterate entries directly — no []Entry allocation
	for i := startIdx; i < r.entryCount && remaining > 0; i++ {
		entry, ok := r.getEntry(i)
		if !ok || entry.MkvOffset >= endOffset {
			break
		}
		// Calculate overlap between the requested range and this entry.
		entryEnd := entry.MkvOffset + entry.Length
		readStart := offset
		if readStart < entry.MkvOffset {
			readStart = entry.MkvOffset
		}
		readEnd := offset + int64(remaining)
		if readEnd > entryEnd {
			readEnd = entryEnd
		}
		readLen := int(readEnd - readStart)
		if readLen <= 0 {
			continue
		}
		// Calculate offset within entry
		offsetInEntry := readStart - entry.MkvOffset
		sourceOffset := entry.SourceOffset + offsetInEntry
		// Calculate buffer position
		bufOffset := int(readStart - originalOffset)
		// Check if this is an LPCM entry needing byte-swap.
		// For LPCM entries, the source data is raw big-endian PCM; we must
		// byte-swap 16-bit pairs aligned to the entry start. Both the start
		// offset and read length may be misaligned to pair boundaries when
		// the caller's buffer doesn't align with entry boundaries.
		needsLPCMSwap := entry.Source != 0 && entry.IsLPCM && !(r.file.UsesESOffsets && r.esReader != nil)
		if needsLPCMSwap {
			// Compute pair-aligned read range within the entry:
			// widen the start down to an even offset within the entry...
			alignedOff := offsetInEntry
			trimFront := 0
			if alignedOff%2 == 1 {
				alignedOff--
				trimFront = 1
			}
			// ...and widen the length up to an even count when the entry has
			// another byte available to complete the final pair.
			alignedLen := readLen + trimFront
			entryRemaining := int(entry.Length - alignedOff)
			if alignedLen%2 == 1 && alignedLen < entryRemaining {
				alignedLen++
			}
			alignedSrcOff := entry.SourceOffset + alignedOff
			tmp := make([]byte, alignedLen)
			if err := r.lpcmAlignedRead(entry, alignedSrcOff, tmp); err != nil {
				return totalRead, fmt.Errorf("read at offset %d: %w", readStart, err)
			}
			// Swap 16-bit big-endian pairs in place, then copy just the
			// requested window (dropping the alignment padding).
			source.TransformLPCM16BE(tmp)
			copy(buf[bufOffset:bufOffset+readLen], tmp[trimFront:trimFront+readLen])
		} else {
			// Normal read path (non-LPCM)
			if err := r.readEntry(entry, sourceOffset, readLen, buf[bufOffset:bufOffset+readLen]); err != nil {
				return totalRead, fmt.Errorf("read at offset %d: %w", readStart, err)
			}
		}
		totalRead += readLen
		remaining -= readLen
		offset = readEnd
	}
	if totalRead == 0 && len(buf) > 0 {
		return 0, io.EOF
	}
	return totalRead, nil
}
// findStartEntry returns the index of the first entry whose range covers offset.
// Uses the entry cache for O(1) sequential access, block index for O(1) narrowing,
// then bounded binary search. Zero allocations.
func (r *Reader) findStartEntry(offset int64) int {
	// Fast path: check if offset is within cached entry
	r.cacheMu.Lock()
	if r.lastEntryValid && r.lastEntryIdx >= 0 && r.lastEntryIdx < r.entryCount {
		if offset >= r.lastEntry.MkvOffset && offset < r.lastEntry.MkvOffset+r.lastEntry.Length {
			idx := r.lastEntryIdx
			r.cacheMu.Unlock()
			return idx
		}
	}
	r.cacheMu.Unlock()
	// Use block index to narrow binary search range
	var lo, hi int
	if r.blockIndex != nil {
		blockNum := int(offset / blockSize)
		if blockNum >= len(r.blockIndex) {
			blockNum = len(r.blockIndex) - 1
		}
		lo = r.blockIndex[blockNum]
		if blockNum+1 < len(r.blockIndex) {
			// Include one entry past the next block's start to cover entries
			// that span a block boundary.
			hi = r.blockIndex[blockNum+1] + 1
			if hi > r.entryCount {
				hi = r.entryCount
			}
		} else {
			hi = r.entryCount
		}
	} else {
		// No block index (e.g. empty file): search the whole entry range.
		lo = 0
		hi = r.entryCount
	}
	// Binary search within [lo, hi) for first entry whose range covers offset.
	// An unreadable entry is treated as "past offset" so the search fails
	// safe toward a smaller index.
	return lo + sort.Search(hi-lo, func(i int) bool {
		mkvOffset, ok := r.getMkvOffset(lo + i)
		if !ok {
			return true
		}
		entryLen, ok := r.getEntryLength(lo + i)
		if !ok {
			return true
		}
		return mkvOffset+entryLen > offset
	})
}
// findEntriesForRange returns all entries overlapping [offset, offset+length),
// in index order.
//
// Locating the first entry is delegated to findStartEntry, which already
// implements the cache fast path, block-index narrowing, and bounded binary
// search; the previous implementation duplicated all of that logic inline.
// This function now only collects entries from that starting index until one
// begins at or past the end of the requested range.
func (r *Reader) findEntriesForRange(offset, length int64) []Entry {
	if r.entryCount == 0 {
		return nil
	}
	endOffset := offset + length
	var result []Entry
	for i := r.findStartEntry(offset); i < r.entryCount; i++ {
		entry, ok := r.getEntry(i)
		if !ok || entry.MkvOffset >= endOffset {
			break
		}
		result = append(result, entry)
	}
	return result
}
// readEntry reads data for one entry into dest, dispatching on the storage
// format: delta section (Source == 0), V4+ range map, V1 ES reader, or V3
// raw source file.
func (r *Reader) readEntry(entry Entry, sourceOffset int64, readLen int, dest []byte) error {
	switch {
	case entry.Source == 0:
		// Delta bytes live inside the dedup file itself (zero-copy slice).
		data, err := r.readDelta(sourceOffset, readLen)
		if err != nil {
			return err
		}
		copy(dest, data)
		return nil
	case r.rangeMapsByFile != nil:
		// V4: range map translates ES offsets to raw file offsets and reads
		// straight into dest (no allocation).
		return r.readViaRangeMapInto(int(entry.Source-1), entry, sourceOffset, dest)
	case r.file.UsesESOffsets && r.esReader != nil:
		// V1: delegate to the external ES reader.
		var data []byte
		var err error
		if entry.IsVideo {
			data, err = r.esReader.ReadESData(sourceOffset, readLen, true)
		} else {
			data, err = r.esReader.ReadAudioSubStreamData(entry.AudioSubStreamID, sourceOffset, readLen)
		}
		if err != nil {
			return err
		}
		copy(dest, data)
		return nil
	default:
		// V3: raw source file offsets.
		return r.readSourceInto(int(entry.Source-1), sourceOffset, dest)
	}
}
// lpcmAlignedRead reads LPCM source bytes at an already pair-aligned source
// offset. It exists for the odd-offset case where the caller must read from
// one byte before the requested offset; it dispatches between the V4 range
// map path and the V3 raw source path.
func (r *Reader) lpcmAlignedRead(entry Entry, alignedSrcOff int64, dest []byte) error {
	fileIndex := int(entry.Source - 1)
	if r.rangeMapsByFile == nil {
		// V3: read from raw source file.
		return r.readSourceInto(fileIndex, alignedSrcOff, dest)
	}
	return r.readViaRangeMapInto(fileIndex, entry, alignedSrcOff, dest)
}
// readDelta returns a zero-copy mmap slice of the delta section at the given
// delta-relative offset.
func (r *Reader) readDelta(offset int64, size int) ([]byte, error) {
	chunk := r.dedupMmap.Slice(r.file.DeltaOffset+offset, size)
	if chunk == nil {
		return nil, fmt.Errorf("delta offset out of range")
	}
	return chunk, nil
}
// readViaRangeMapInto resolves the per-stream range map for the entry and
// reads the data at sourceOffset (an ES offset) directly into dest, avoiding
// allocation.
func (r *Reader) readViaRangeMapInto(fileIndex int, entry Entry, sourceOffset int64, dest []byte) error {
	maps, ok := r.rangeMapsByFile[fileIndex]
	if !ok {
		return fmt.Errorf("no range map for source file %d", fileIndex)
	}
	if fileIndex < 0 || fileIndex >= len(r.sourceFiles) || r.sourceFiles[fileIndex] == nil {
		return fmt.Errorf("source file %d not loaded for range map read", fileIndex)
	}
	sf := r.sourceFiles[fileIndex]
	if entry.IsVideo {
		if maps.VideoMap == nil {
			return fmt.Errorf("no video range map for source file %d", fileIndex)
		}
		_, err := maps.VideoMap.ReadDataInto(sf, sourceOffset, dest)
		return err
	}
	am, ok := maps.AudioMaps[entry.AudioSubStreamID]
	if !ok {
		return fmt.Errorf("no audio sub-stream %d range map for source file %d", entry.AudioSubStreamID, fileIndex)
	}
	_, err := am.ReadDataInto(sf, sourceOffset, dest)
	return err
}
// readSourceInto fills dest from the source file at fileIndex starting at
// offset. A short read is reported as io.ErrUnexpectedEOF so callers (FUSE)
// surface EIO instead of silently truncating the data.
func (r *Reader) readSourceInto(fileIndex int, offset int64, dest []byte) error {
	if fileIndex < 0 || fileIndex >= len(r.sourceFiles) {
		return fmt.Errorf("invalid file index: %d", fileIndex)
	}
	sf := r.sourceFiles[fileIndex]
	if sf == nil {
		return fmt.Errorf("source file %d not loaded", fileIndex)
	}
	n, err := sf.ReadAt(dest, offset)
	if n == len(dest) {
		// EOF exactly at the end of the requested range is still a full read.
		if err == io.EOF {
			return nil
		}
		return err
	}
	if err == nil || err == io.EOF {
		return io.ErrUnexpectedEOF
	}
	return err
}
// parseHeaderOnly parses just the header and source files (not entries) for fast initialization.
func parseHeaderOnly(r io.Reader) (*File, error) {
file := &File{}
// Read and verify magic
magic := make([]byte, MagicSize)
if _, err := io.ReadFull(r, magic); err != nil {
return nil, fmt.Errorf("read magic: %w", err)
}
if string(magic) != Magic {
return nil, fmt.Errorf("invalid magic: %s", magic)
}
copy(file.Header.Magic[:], magic)
// Read version
if err := binary.Read(r, binary.LittleEndian, &file.Header.Version); err != nil {
return nil, fmt.Errorf("read version: %w", err)
}
// Support versions 3-8. Older versions must be recreated.
switch file.Header.Version {
case Version, VersionRangeMap, VersionCreator, VersionRangeMapCreator,
VersionUsed, VersionRangeMapUsed:
// OK
case 1:
return nil, fmt.Errorf("unsupported version 1 (uses ES offsets); please recreate with 'mkvdup create'")
case 2:
return nil, fmt.Errorf("unsupported version 2 (uses uint8 source index); please recreate with 'mkvdup create'")
default:
return nil, fmt.Errorf("unsupported version: %d (expected 3-8)", file.Header.Version)
}
// Read flags
if err := binary.Read(r, binary.LittleEndian, &file.Header.Flags); err != nil {
return nil, fmt.Errorf("read flags: %w", err)
}
// Read original size
if err := binary.Read(r, binary.LittleEndian, &file.Header.OriginalSize); err != nil {
return nil, fmt.Errorf("read original size: %w", err)
}
// Read original checksum
if err := binary.Read(r, binary.LittleEndian, &file.Header.OriginalChecksum); err != nil {
return nil, fmt.Errorf("read original checksum: %w", err)
}
// Read source type
if err := binary.Read(r, binary.LittleEndian, &file.Header.SourceType); err != nil {
return nil, fmt.Errorf("read source type: %w", err)
}
// Read uses ES offsets flag
if err := binary.Read(r, binary.LittleEndian, &file.Header.UsesESOffsets); err != nil {
return nil, fmt.Errorf("read uses ES offsets: %w", err)
}
file.UsesESOffsets = file.Header.UsesESOffsets == 1
// Read source file count
if err := binary.Read(r, binary.LittleEndian, &file.Header.SourceFileCount); err != nil {
return nil, fmt.Errorf("read source file count: %w", err)
}
// Read entry count
if err := binary.Read(r, binary.LittleEndian, &file.Header.EntryCount); err != nil {
return nil, fmt.Errorf("read entry count: %w", err)
}
// Read delta offset
if err := binary.Read(r, binary.LittleEndian, &file.Header.DeltaOffset); err != nil {
return nil, fmt.Errorf("read delta offset: %w", err)
}
file.DeltaOffset = file.Header.DeltaOffset
// Read delta size
if err := binary.Read(r, binary.LittleEndian, &file.Header.DeltaSize); err != nil {
return nil, fmt.Errorf("read delta size: %w", err)
}
// Read creator version string (V5/V6 only)
file.headerSize = int64(HeaderSize)
if file.Header.Version >= VersionCreator {
var versionLen uint16
if err := binary.Read(r, binary.LittleEndian, &versionLen); err != nil {
return nil, fmt.Errorf("read creator version length: %w", err)
}
if versionLen > MaxCreatorVersionLen {
return nil, fmt.Errorf("creator version length %d exceeds maximum (%d)", versionLen, MaxCreatorVersionLen)
}
if versionLen > 0 {
versionBytes := make([]byte, versionLen)
if _, err := io.ReadFull(r, versionBytes); err != nil {
return nil, fmt.Errorf("read creator version: %w", err)
}
file.CreatorVersion = string(versionBytes)
}
file.headerSize = int64(HeaderSize) + 2 + int64(versionLen)
}
// Read source files
file.SourceFiles = make([]SourceFile, file.Header.SourceFileCount)
for i := range file.SourceFiles {
var pathLen uint16
if err := binary.Read(r, binary.LittleEndian, &pathLen); err != nil {
return nil, fmt.Errorf("read path length: %w", err)
}
path := make([]byte, pathLen)
if _, err := io.ReadFull(r, path); err != nil {
return nil, fmt.Errorf("read path: %w", err)
}
file.SourceFiles[i].RelativePath = string(path)
if err := binary.Read(r, binary.LittleEndian, &file.SourceFiles[i].Size); err != nil {
return nil, fmt.Errorf("read file size: %w", err)
}
if err := binary.Read(r, binary.LittleEndian, &file.SourceFiles[i].Checksum); err != nil {
return nil, fmt.Errorf("read file checksum: %w", err)
}
// V7/V8: read used flag
if file.Header.Version == VersionUsed || file.Header.Version == VersionRangeMapUsed {
var used uint8
if err := binary.Read(r, binary.LittleEndian, &used); err != nil {
return nil, fmt.Errorf("read file used flag: %w", err)
}
file.SourceFiles[i].Used = used == 1
}
}
// Entries are accessed directly from mmap via Reader.getEntry()
return file, nil
}
// VerifyIntegrity verifies the dedup file checksums.
//
// It reads the footer from the mmap'd dedup file, recomputes the xxhash64
// of the index section, the delta section, and (for formats with range
// maps) the range map section, and compares each against the checksum
// stored in the footer. All section reads are zero-copy slices into the
// mmap. Returns a descriptive error on the first mismatch or any
// out-of-range slice.
func (r *Reader) VerifyIntegrity() error {
	// Initialize entry access to get entryCount (needed to size the index section).
	if err := r.initEntryAccess(); err != nil {
		return fmt.Errorf("init entry access: %w", err)
	}
	// Footer is larger when a range map checksum is present.
	footerSz := int64(FooterSize)
	if r.hasRangeMaps() {
		footerSz = int64(FooterV4Size)
	}
	fileSize := r.dedupMmap.Size()
	// Read footer from mmap; it sits at the very end of the file.
	footerOffset := fileSize - footerSz
	footerData := r.dedupMmap.Slice(footerOffset, int(footerSz))
	if footerData == nil {
		return fmt.Errorf("footer slice out of range")
	}
	var footer Footer
	off := 0
	footer.IndexChecksum = binary.LittleEndian.Uint64(footerData[off : off+8])
	off += 8
	footer.DeltaChecksum = binary.LittleEndian.Uint64(footerData[off : off+8])
	off += 8
	if r.hasRangeMaps() {
		footer.RangeMapChecksum = binary.LittleEndian.Uint64(footerData[off : off+8])
		off += 8
	}
	// The footer ends with the same magic bytes as the header.
	if string(footerData[off:off+MagicSize]) != Magic {
		return fmt.Errorf("invalid footer magic")
	}
	// Calculate and verify index checksum (zero-copy).
	indexSize := int(int64(r.entryCount) * EntrySize)
	indexData := r.dedupMmap.Slice(r.indexStart, indexSize)
	if indexData == nil {
		return fmt.Errorf("read index for checksum: slice out of range")
	}
	if xxhash.Sum64(indexData) != footer.IndexChecksum {
		return fmt.Errorf("index checksum mismatch")
	}
	// Calculate and verify delta checksum (zero-copy).
	deltaData := r.dedupMmap.Slice(r.file.DeltaOffset, int(r.file.Header.DeltaSize))
	if deltaData == nil {
		return fmt.Errorf("read delta for checksum: slice out of range")
	}
	if xxhash.Sum64(deltaData) != footer.DeltaChecksum {
		return fmt.Errorf("delta checksum mismatch")
	}
	// Formats with range maps: verify the range map section, which sits
	// between the delta data and the footer.
	if r.hasRangeMaps() {
		rangeMapOffset := r.file.DeltaOffset + r.file.Header.DeltaSize
		rangeMapSize := int(footerOffset - rangeMapOffset)
		if rangeMapSize > 0 {
			rangeMapData := r.dedupMmap.Slice(rangeMapOffset, rangeMapSize)
			if rangeMapData == nil {
				return fmt.Errorf("read range map for checksum: slice out of range")
			}
			if xxhash.Sum64(rangeMapData) != footer.RangeMapChecksum {
				return fmt.Errorf("range map checksum mismatch")
			}
		}
	}
	return nil
}
// calculateSourceFilesSize returns the encoded byte length of the source
// files section: per file, a 2-byte path length, the path bytes, an 8-byte
// size, an 8-byte checksum, and — for formats with used flags — 1 flag byte.
func (r *Reader) calculateSourceFilesSize() int64 {
	perFileFixed := int64(2 + 8 + 8)
	if r.HasSourceUsedFlags() {
		perFileFixed++
	}
	total := perFileFixed * int64(len(r.file.SourceFiles))
	for _, sf := range r.file.SourceFiles {
		total += int64(len(sf.RelativePath))
	}
	return total
}
// Info returns a summary of the dedup file as a string-keyed map.
// Entry access is initialized first so "entry_count" is populated; if that
// initialization fails, the "error" key holds the failure message and
// "entry_count" reports 0.
func (r *Reader) Info() map[string]any {
	initErr := r.initEntryAccess() // ensures r.entryCount is set
	summary := map[string]any{
		"version":           r.file.Header.Version,
		"original_size":     r.file.Header.OriginalSize,
		"original_checksum": r.file.Header.OriginalChecksum,
		"source_type":       r.file.Header.SourceType,
		"uses_es_offsets":   r.file.UsesESOffsets,
		"has_range_maps":    r.rangeMapsByFile != nil,
		"source_file_count": len(r.file.SourceFiles),
		"entry_count":       r.entryCount,
		"delta_size":        r.file.Header.DeltaSize,
		"creator_version":   r.file.CreatorVersion,
	}
	if initErr != nil {
		summary["error"] = initErr.Error()
	}
	return summary
}
package dedup
// stridedCopy gathers count fixed-size payload blocks from src into a
// contiguous dst. Consecutive blocks in src start stride bytes apart
// (stride >= payloadSize); dst receives them back to back. Keeping the
// copies in one tight loop avoids per-block call overhead when extracting
// many small payloads (e.g. 184-byte M2TS PES payloads at 192-byte stride).
func stridedCopy(dst, src []byte, count, payloadSize, stride int) {
	for i := 0; i < count; i++ {
		block := src[i*stride : i*stride+payloadSize]
		copy(dst[i*payloadSize:(i+1)*payloadSize], block)
	}
}
package dedup
import (
"bufio"
"encoding/binary"
"fmt"
"io"
"os"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/matcher"
"github.com/stuckj/mkvdup/internal/source"
)
// Writer creates .mkvdup files.
//
// Typical usage: SetHeader, SetSourceFiles, optionally SetRangeMaps and/or
// SetCreatorVersion, SetMatchResult, then Write (or WriteWithProgress),
// and finally Close.
type Writer struct {
	file           *os.File             // output .mkvdup file
	header         Header               // header fields; version finalized in resolveVersion()
	sourceFiles    []SourceFile         // source file table written after the header
	entries        []Entry              // index entries mapping MKV ranges to source/delta data
	deltaData      []byte               // In-memory delta (for tests / small files)
	deltaFile      *matcher.DeltaWriter // File-backed delta (for large files)
	rangeMaps      []RangeMapData       // V4/V6: per-source-file range maps (nil for V3/V5)
	rangeMapBuf    []byte               // Pre-encoded range map section (set by EncodeRangeMaps)
	creatorVersion string               // Version string to embed in the file
}
// NewWriter creates a new dedup file writer that writes to path.
// The caller is responsible for calling Close when finished.
func NewWriter(path string) (*Writer, error) {
	out, err := os.Create(path)
	if err != nil {
		return nil, fmt.Errorf("create file: %w", err)
	}
	w := &Writer{file: out}
	return w, nil
}
// SetCreatorVersion sets the version string to embed in the file,
// truncating it to MaxCreatorVersionLen bytes if necessary.
// When set, the writer produces V7 (or V8 if range maps are also set).
func (w *Writer) SetCreatorVersion(v string) {
	if len(v) <= MaxCreatorVersionLen {
		w.creatorVersion = v
		return
	}
	w.creatorVersion = v[:MaxCreatorVersionLen]
}
// SetHeader records the original file's size, checksum, and source type in
// the header and stamps the magic bytes. The version defaults to V3 and is
// upgraded to V7/V8 later by resolveVersion().
func (w *Writer) SetHeader(originalSize int64, originalChecksum uint64, sourceType source.Type) {
	copy(w.header.Magic[:], Magic)
	w.header.Version = Version // Default V3; upgraded to V7/V8 in resolveVersion()
	w.header.Flags = 0
	w.header.OriginalSize = originalSize
	w.header.OriginalChecksum = originalChecksum
	w.header.UsesESOffsets = 0 // v2 always uses raw offsets
	if sourceType == source.TypeDVD {
		w.header.SourceType = SourceTypeDVD
	} else if sourceType == source.TypeBluray {
		w.header.SourceType = SourceTypeBluray
	}
}
// SetSourceFiles records the source file list and its count in the header.
func (w *Writer) SetSourceFiles(files []source.File) {
	converted := make([]SourceFile, 0, len(files))
	for _, sf := range files {
		converted = append(converted, ToSourceFile(sf))
	}
	w.sourceFiles = converted
	w.header.SourceFileCount = uint16(len(files))
}
// SetRangeMaps sets the range map data for the V4 format family.
// When range maps are set, ES-offset entries are preserved (not converted
// to raw offsets) and a range map section is written to the dedup file for
// mapping ES offsets to raw file positions at read time.
func (w *Writer) SetRangeMaps(rangeMaps []RangeMapData) {
	w.header.UsesESOffsets = 1
	w.header.Version = VersionRangeMap // Default V4; upgraded to V8 in resolveVersion()
	w.rangeMaps = rangeMaps
}
// resolveVersion sets the final file version based on configured features:
// range maps select the range-map family, and a non-empty creator version
// upgrades to the "used flags" formats (V7/V8).
func (w *Writer) resolveVersion() {
	hasCreator := w.creatorVersion != ""
	switch {
	case w.rangeMaps != nil && hasCreator:
		w.header.Version = VersionRangeMapUsed // V8
	case w.rangeMaps != nil:
		w.header.Version = VersionRangeMap // V4
	case hasCreator:
		w.header.Version = VersionUsed // V7
	default:
		w.header.Version = Version // V3
	}
}
// computeUsedFlags scans entries and marks which source files are
// referenced. Entries with Source == 0 are delta entries and reference no
// source file; Source is otherwise a 1-based index into sourceFiles.
func (w *Writer) computeUsedFlags() {
	referenced := make([]bool, len(w.sourceFiles))
	for _, e := range w.entries {
		if e.Source == 0 {
			continue
		}
		if idx := int(e.Source - 1); idx < len(referenced) {
			referenced[idx] = true
		}
	}
	for i := range w.sourceFiles {
		w.sourceFiles[i].Used = referenced[i]
	}
}
// EncodeRangeMaps pre-encodes the range map section and caches it on the
// writer. Call this before WriteWithProgress to avoid a CPU-intensive
// encoding phase with no progress output. Returns the encoded size.
// If range maps are nil, this is a no-op returning 0.
func (w *Writer) EncodeRangeMaps() (int64, error) {
	if w.rangeMaps == nil {
		return 0, nil
	}
	encoded, err := encodeRangeMapSection(w.rangeMaps)
	if err != nil {
		return 0, fmt.Errorf("encode range maps: %w", err)
	}
	w.rangeMapBuf = encoded
	return int64(len(encoded)), nil
}
// SetMatchResult sets the match result (entries and delta).
// If esConverters is provided and non-empty, ES-offset entries will be converted
// to raw-offset entries, potentially splitting entries that span multiple ranges.
// When range maps are set (V4 family), entries stay in ES-offset form since the
// range maps perform the mapping at read time. The delta may be file-backed
// (large files) or in-memory (tests / small files); DeltaSize is set from
// whichever is present.
func (w *Writer) SetMatchResult(result *matcher.Result, esConverters []source.ESRangeConverter) error {
	// Convert matcher entries to dedup entries
	entries := make([]Entry, len(result.Entries))
	for i, e := range result.Entries {
		entries[i] = FromMatcherEntry(e)
	}
	// Convert ES offsets to raw offsets if we have converters.
	// Skip conversion for V4 (range maps handle the mapping at read time).
	if len(esConverters) > 0 && w.rangeMaps == nil {
		var err error
		entries, err = w.convertESToRawOffsets(entries, esConverters)
		if err != nil {
			return fmt.Errorf("convert ES to raw offsets: %w", err)
		}
	}
	w.entries = entries
	w.header.EntryCount = uint64(len(w.entries))
	if result.DeltaFile != nil {
		// File-backed delta: size comes from the temp file.
		w.deltaFile = result.DeltaFile
		w.header.DeltaSize = result.DeltaFile.Size()
	} else {
		// In-memory delta.
		w.deltaData = result.DeltaData
		w.header.DeltaSize = int64(len(result.DeltaData))
	}
	return nil
}
// convertESToRawOffsets converts ES-offset entries to raw-offset entries.
// Entries that span multiple PES payload ranges are split into multiple
// entries — one per contiguous raw range — with the MKV offset advanced by
// each range's size so the MKV side stays contiguous. Delta entries
// (Source == 0) and entries without an available converter pass through
// unchanged.
func (w *Writer) convertESToRawOffsets(entries []Entry, esConverters []source.ESRangeConverter) ([]Entry, error) {
	// Pre-allocate with ~2x capacity since entries typically expand to multiple raw ranges
	result := make([]Entry, 0, len(entries)*2)
	for _, entry := range entries {
		if entry.Source == 0 {
			// Delta entry - no conversion needed
			result = append(result, entry)
			continue
		}
		// Get the ES converter for this source file (Source is 1-based)
		fileIndex := int(entry.Source - 1)
		if fileIndex >= len(esConverters) || esConverters[fileIndex] == nil {
			// No converter available - assume raw offsets already
			result = append(result, entry)
			continue
		}
		converter := esConverters[fileIndex]
		// Get raw ranges for this ES region. Video and audio use different
		// lookup paths; audio is additionally addressed by sub-stream ID.
		var rawRanges []source.RawRange
		var err error
		if entry.IsVideo {
			rawRanges, err = converter.RawRangesForESRegion(entry.SourceOffset, int(entry.Length), true)
		} else {
			rawRanges, err = converter.RawRangesForAudioSubStream(entry.AudioSubStreamID, entry.SourceOffset, int(entry.Length))
		}
		if err != nil {
			return nil, fmt.Errorf("convert entry at MKV offset %d: %w", entry.MkvOffset, err)
		}
		// Create one entry per raw range
		mkvOffset := entry.MkvOffset
		for _, rr := range rawRanges {
			result = append(result, Entry{
				MkvOffset:        mkvOffset,
				Length:           int64(rr.Size),
				Source:           entry.Source,
				SourceOffset:     rr.FileOffset, // Raw file offset!
				IsVideo:          entry.IsVideo,
				AudioSubStreamID: entry.AudioSubStreamID,
				IsLPCM:           entry.IsLPCM,
			})
			mkvOffset += int64(rr.Size)
		}
	}
	return result, nil
}
// WriteProgressFunc is called to report write progress.
// written is the number of bytes emitted so far; total is the final file size.
type WriteProgressFunc func(written, total int64)
// Write writes the dedup file without progress reporting.
// It is equivalent to WriteWithProgress(nil).
func (w *Writer) Write() error {
	return w.WriteWithProgress(nil)
}
// WriteWithProgress writes the dedup file with progress reporting.
//
// File layout, in order: header (+ creator version string when set), source
// file table, index entries, delta data, optional range map section, footer.
// The progress callback receives bytes written against the precomputed
// total file size.
func (w *Writer) WriteWithProgress(progress WriteProgressFunc) error {
	// Scan entries to compute per-source Used flags, then determine file version.
	w.computeUsedFlags()
	w.resolveVersion()
	// Use pre-encoded range maps if available (from EncodeRangeMaps),
	// otherwise encode now.
	rangeMapBuf := w.rangeMapBuf
	if rangeMapBuf == nil && w.rangeMaps != nil {
		var err error
		rangeMapBuf, err = encodeRangeMapSection(w.rangeMaps)
		if err != nil {
			return fmt.Errorf("encode range maps: %w", err)
		}
	}
	// Calculate offsets and total size. The delta section starts right
	// after header + creator version + source file table + index.
	sourceFilesSize := w.calculateSourceFilesSize()
	cvSize := creatorVersionSize(w.creatorVersion)
	indexSize := int64(len(w.entries)) * EntrySize
	deltaOffset := int64(HeaderSize) + cvSize + sourceFilesSize + indexSize
	w.header.DeltaOffset = deltaOffset
	// Footer is larger when a range map checksum must be stored.
	footerSize := int64(FooterSize)
	if rangeMapBuf != nil {
		footerSize = FooterV4Size
	}
	totalSize := deltaOffset + w.header.DeltaSize + int64(len(rangeMapBuf)) + footerSize
	var written int64
	// Write header (includes the creator version string when set — this
	// writer emits V7/V8 in that case; see resolveVersion).
	if err := w.writeHeader(); err != nil {
		return fmt.Errorf("write header: %w", err)
	}
	written += int64(HeaderSize) + cvSize
	// Write source files section
	if err := w.writeSourceFiles(); err != nil {
		return fmt.Errorf("write source files: %w", err)
	}
	written += sourceFilesSize
	// Write index entries and calculate checksum
	indexChecksum, err := w.writeEntriesWithProgress(progress, &written, totalSize)
	if err != nil {
		return fmt.Errorf("write entries: %w", err)
	}
	// Write delta data and calculate checksum
	deltaChecksum, err := w.writeDeltaWithProgress(progress, &written, totalSize)
	if err != nil {
		return fmt.Errorf("write delta: %w", err)
	}
	// Write range map section (range-map formats only)
	var rangeMapChecksum uint64
	if rangeMapBuf != nil {
		rangeMapChecksum, err = writeRangeMapSection(w.file, rangeMapBuf)
		if err != nil {
			return fmt.Errorf("write range map: %w", err)
		}
		written += int64(len(rangeMapBuf))
		if progress != nil {
			progress(written, totalSize)
		}
	}
	// Write footer (section checksums + trailing magic)
	if err := w.writeFooter(indexChecksum, deltaChecksum, rangeMapChecksum); err != nil {
		return fmt.Errorf("write footer: %w", err)
	}
	if progress != nil {
		progress(totalSize, totalSize)
	}
	return nil
}
// Close closes the underlying output file, if one is open.
func (w *Writer) Close() error {
	if w.file == nil {
		return nil
	}
	return w.file.Close()
}
// calculateSourceFilesSize returns the encoded size of the source files
// section: per file, PathLen (2) + Path (variable) + Size (8) + Checksum (8),
// plus a 1-byte Used flag for V7/V8 files.
func (w *Writer) calculateSourceFilesSize() int64 {
	fixed := int64(2 + 8 + 8)
	if w.header.Version == VersionUsed || w.header.Version == VersionRangeMapUsed {
		fixed++
	}
	size := fixed * int64(len(w.sourceFiles))
	for _, sf := range w.sourceFiles {
		size += int64(len(sf.RelativePath))
	}
	return size
}
// writeHeader emits the fixed header fields in order, followed by the
// optional length-prefixed creator version string. Field order must match
// the reader's header-parsing sequence exactly.
func (w *Writer) writeHeader() error {
	// Write magic
	if _, err := w.file.Write([]byte(Magic)); err != nil {
		return err
	}
	// Write version
	if err := binary.Write(w.file, binary.LittleEndian, w.header.Version); err != nil {
		return err
	}
	// Write flags
	if err := binary.Write(w.file, binary.LittleEndian, w.header.Flags); err != nil {
		return err
	}
	// Write original size
	if err := binary.Write(w.file, binary.LittleEndian, w.header.OriginalSize); err != nil {
		return err
	}
	// Write original checksum
	if err := binary.Write(w.file, binary.LittleEndian, w.header.OriginalChecksum); err != nil {
		return err
	}
	// Write source type
	if err := binary.Write(w.file, binary.LittleEndian, w.header.SourceType); err != nil {
		return err
	}
	// Write uses ES offsets flag
	if err := binary.Write(w.file, binary.LittleEndian, w.header.UsesESOffsets); err != nil {
		return err
	}
	// Write source file count
	if err := binary.Write(w.file, binary.LittleEndian, w.header.SourceFileCount); err != nil {
		return err
	}
	// Write entry count
	if err := binary.Write(w.file, binary.LittleEndian, w.header.EntryCount); err != nil {
		return err
	}
	// Write delta offset
	if err := binary.Write(w.file, binary.LittleEndian, w.header.DeltaOffset); err != nil {
		return err
	}
	// Write delta size
	if err := binary.Write(w.file, binary.LittleEndian, w.header.DeltaSize); err != nil {
		return err
	}
	// Write creator version string when one is set (2-byte length prefix
	// followed by the bytes). resolveVersion() guarantees the file version
	// is V7/V8 in this case, so readers expect the field to be present.
	if w.creatorVersion != "" {
		versionLen := uint16(len(w.creatorVersion))
		if err := binary.Write(w.file, binary.LittleEndian, versionLen); err != nil {
			return err
		}
		if _, err := w.file.Write([]byte(w.creatorVersion)); err != nil {
			return err
		}
	}
	return nil
}
// writeSourceFiles emits the source file table: for each file, a 2-byte
// path length, the path bytes, the 8-byte size, the 8-byte checksum, and —
// for V7/V8 formats — a 1-byte used flag.
func (w *Writer) writeSourceFiles() error {
	withUsed := w.header.Version == VersionUsed || w.header.Version == VersionRangeMapUsed
	for _, sf := range w.sourceFiles {
		if err := binary.Write(w.file, binary.LittleEndian, uint16(len(sf.RelativePath))); err != nil {
			return err
		}
		if _, err := w.file.Write([]byte(sf.RelativePath)); err != nil {
			return err
		}
		if err := binary.Write(w.file, binary.LittleEndian, sf.Size); err != nil {
			return err
		}
		if err := binary.Write(w.file, binary.LittleEndian, sf.Checksum); err != nil {
			return err
		}
		if !withUsed {
			continue
		}
		flag := uint8(0)
		if sf.Used {
			flag = 1
		}
		if err := binary.Write(w.file, binary.LittleEndian, flag); err != nil {
			return err
		}
	}
	return nil
}
// writeEntriesWithProgress serializes all index entries to the file through
// a buffered writer while simultaneously feeding the bytes to an xxhash64
// hasher. Returns the checksum of the serialized index section for the
// footer.
//
// Entry wire layout (little-endian): MkvOffset u64 | Length u64 | Source u16
// | SourceOffset u64 | ES flags u8 (bit 0 IsVideo, bit 1 IsLPCM) |
// AudioSubStreamID u8 — EntrySize bytes total.
func (w *Writer) writeEntriesWithProgress(progress WriteProgressFunc, written *int64, total int64) (uint64, error) {
	hasher := xxhash.New()
	// Use buffered writer to batch syscalls (64KB buffer)
	bufWriter := bufio.NewWriterSize(w.file, 64*1024)
	writer := io.MultiWriter(bufWriter, hasher)
	entryCount := len(w.entries)
	lastProgress := 0
	// Reusable buffer for entry serialization (allocation-free per entry)
	var entryBuf [EntrySize]byte
	for i, entry := range w.entries {
		// Serialize entry to buffer using allocation-free Put* functions
		binary.LittleEndian.PutUint64(entryBuf[0:8], uint64(entry.MkvOffset))
		binary.LittleEndian.PutUint64(entryBuf[8:16], uint64(entry.Length))
		binary.LittleEndian.PutUint16(entryBuf[16:18], entry.Source)
		binary.LittleEndian.PutUint64(entryBuf[18:26], uint64(entry.SourceOffset))
		// ES flags byte: bit 0 = IsVideo, bit 1 = IsLPCM
		var esFlags uint8
		if entry.IsVideo {
			esFlags |= 1
		}
		if entry.IsLPCM {
			esFlags |= 2
		}
		entryBuf[26] = esFlags
		entryBuf[27] = entry.AudioSubStreamID
		// Single write per entry
		if _, err := writer.Write(entryBuf[:]); err != nil {
			return 0, err
		}
		*written += EntrySize
		// Report progress every 1% or 10000 entries
		if progress != nil && entryCount > 0 {
			pct := (i + 1) * 100 / entryCount
			if pct > lastProgress || (i+1)%10000 == 0 {
				progress(*written, total)
				lastProgress = pct
			}
		}
	}
	// Flush buffered writer so all index bytes reach the file before the
	// delta section is written.
	if err := bufWriter.Flush(); err != nil {
		return 0, err
	}
	return hasher.Sum64(), nil
}
// writeDeltaWithProgress streams the delta data to the output file while
// hashing it with xxhash64, reporting progress at most once per percent of
// the total file size. The delta comes either from a temp file (large
// files) or from the in-memory buffer (tests / small files). Returns the
// checksum of the delta bytes for the footer.
func (w *Writer) writeDeltaWithProgress(progress WriteProgressFunc, written *int64, total int64) (uint64, error) {
	hasher := xxhash.New()
	const chunkSize = 64 * 1024 // 64KB chunks
	lastProgress := 0
	if w.deltaFile != nil {
		// Read from temp file and write to output.
		f := w.deltaFile.File()
		// io.SeekStart instead of the magic constant 0 for the whence arg.
		if _, err := f.Seek(0, io.SeekStart); err != nil {
			return 0, fmt.Errorf("seek delta file: %w", err)
		}
		buf := make([]byte, chunkSize)
		for {
			n, err := f.Read(buf)
			if n > 0 {
				chunk := buf[:n]
				if _, werr := w.file.Write(chunk); werr != nil {
					return 0, werr
				}
				// xxhash Digest.Write never fails.
				hasher.Write(chunk)
				*written += int64(n)
				if progress != nil && w.header.DeltaSize > 0 {
					pct := int((*written * 100) / total)
					if pct > lastProgress {
						progress(*written, total)
						lastProgress = pct
					}
				}
			}
			// Check err after consuming n bytes: Read may return data
			// together with io.EOF.
			if err == io.EOF {
				break
			}
			if err != nil {
				return 0, err
			}
		}
	} else {
		// In-memory path (for tests / small files)
		data := w.deltaData
		for len(data) > 0 {
			chunk := data
			if len(chunk) > chunkSize {
				chunk = data[:chunkSize]
			}
			data = data[len(chunk):]
			if _, err := w.file.Write(chunk); err != nil {
				return 0, err
			}
			hasher.Write(chunk)
			*written += int64(len(chunk))
			if progress != nil && w.header.DeltaSize > 0 {
				pct := int((*written * 100) / total)
				if pct > lastProgress {
					progress(*written, total)
					lastProgress = pct
				}
			}
		}
	}
	return hasher.Sum64(), nil
}
// writeFooter emits the footer: index checksum, delta checksum, an optional
// range map checksum (only when range maps are configured), and finally the
// magic bytes.
func (w *Writer) writeFooter(indexChecksum, deltaChecksum, rangeMapChecksum uint64) error {
	checksums := []uint64{indexChecksum, deltaChecksum}
	if w.rangeMaps != nil {
		checksums = append(checksums, rangeMapChecksum)
	}
	for _, sum := range checksums {
		if err := binary.Write(w.file, binary.LittleEndian, sum); err != nil {
			return err
		}
	}
	_, err := w.file.Write([]byte(Magic))
	return err
}
package fuse
import (
"fmt"
"path/filepath"
"time"
"github.com/stuckj/mkvdup/internal/dedup"
"github.com/stuckj/mkvdup/internal/security"
"github.com/stuckj/mkvdup/internal/source"
)
// Compile-time assertions that the adapters implement their interfaces;
// a mismatch fails the build rather than surfacing at runtime.
var _ ReaderInitializer = (*dedupReaderAdapter)(nil)
var _ ReaderFactory = (*DefaultReaderFactory)(nil)
var _ ConfigReader = (*DefaultConfigReader)(nil)
// dedupReaderAdapter wraps dedup.Reader to implement the ReaderInitializer
// interface, adding FUSE-layer concerns (network-FS pread timeout, optional
// source index lifetime).
type dedupReaderAdapter struct {
	reader      *dedup.Reader // underlying dedup file reader
	readTimeout time.Duration // pread timeout for network FS sources
	// index stores the source index for cleanup when using ES offsets.
	// This is nil when using raw source files.
	index *source.Index
}
// OriginalSize returns the original (pre-dedup) file size reported by the
// underlying dedup reader.
func (a *dedupReaderAdapter) OriginalSize() int64 {
	return a.reader.OriginalSize()
}
// UsesESOffsets reports whether the dedup file stores elementary-stream
// offsets rather than raw file offsets.
func (a *dedupReaderAdapter) UsesESOffsets() bool {
	return a.reader.UsesESOffsets()
}
// InitializeForReading prepares source file access for the given directory.
// Three paths:
//   - legacy ES offsets without range maps: build a full source index
//     (guard only — no current format reaches this branch),
//   - network filesystems: open sources with pread+retry rather than mmap,
//   - local filesystems: mmap sources for zero-copy reads.
func (a *dedupReaderAdapter) InitializeForReading(sourceDir string) error {
	if a.reader.UsesESOffsets() && !a.reader.HasRangeMaps() {
		// Legacy guard: ES offsets without range maps would need a full
		// ES reader. No current format hits this path — DVD formats
		// (V3/V5/V7) use raw file offsets, and Blu-ray formats (V4/V6/V8)
		// always have range maps. Kept for safety against future formats.
		indexer, err := source.NewIndexer(sourceDir, source.DefaultWindowSize)
		if err != nil {
			return fmt.Errorf("create indexer: %w", err)
		}
		if err := indexer.Build(nil); err != nil {
			return fmt.Errorf("build index: %w", err)
		}
		index := indexer.Index()
		if len(index.ESReaders) > 0 {
			a.reader.SetESReader(index.ESReaders[0])
		}
		// Store index for cleanup in Close()
		a.index = index
	} else if isNetworkFS(sourceDir) {
		// Network FS: use pread with retry instead of mmap to avoid SIGBUS.
		if err := a.reader.LoadSourceFilesPread(a.readTimeout); err != nil {
			return fmt.Errorf("load source files (pread): %w", err)
		}
	} else {
		// Local FS: mmap for zero-copy performance.
		// Range maps handle ES-to-raw translation at read time.
		if err := a.reader.LoadSourceFiles(); err != nil {
			return fmt.Errorf("load source files: %w", err)
		}
	}
	return nil
}
// SourceFileInfo returns metadata for the reader's source files. When the
// dedup file records per-source used flags, files marked unused are omitted.
func (a *dedupReaderAdapter) SourceFileInfo() []SourceFileInfo {
	filterUnused := a.reader.HasSourceUsedFlags()
	var out []SourceFileInfo
	for _, sf := range a.reader.SourceFiles() {
		if filterUnused && !sf.Used {
			continue
		}
		out = append(out, SourceFileInfo{
			RelativePath: sf.RelativePath,
			Size:         sf.Size,
			Checksum:     sf.Checksum,
		})
	}
	return out
}
// ReadAt delegates the positional read to the underlying dedup reader.
func (a *dedupReaderAdapter) ReadAt(p []byte, off int64) (n int, err error) {
	return a.reader.ReadAt(p, off)
}
// Close releases the dedup reader and, when one was built, the source
// index. Both are closed even if the first close fails; the first error
// encountered is returned.
func (a *dedupReaderAdapter) Close() error {
	var firstErr error
	if err := a.reader.Close(); err != nil {
		firstErr = err
	}
	if a.index != nil {
		if err := a.index.Close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
// DefaultReaderFactory is the default implementation of ReaderFactory.
type DefaultReaderFactory struct {
	// ReadTimeout is the pread timeout applied when source files live on a
	// network filesystem; it is passed through to each reader adapter.
	ReadTimeout time.Duration
}
// NewReaderLazy opens a lazily-initialized dedup reader for dedupPath
// backed by sources in sourceDir, after validating ownership/permissions
// on both paths. Do not reorder the resolve → check → open sequence: it is
// what closes the TOCTOU window described below.
func (f *DefaultReaderFactory) NewReaderLazy(dedupPath, sourceDir string) (ReaderInitializer, error) {
	// When running as root, resolve symlinks once and use the canonical
	// paths for both security checks and subsequent opens. This closes
	// the TOCTOU window where a symlink could be swapped between the
	// ownership check and the actual open/mmap. We use the Resolved
	// variants to avoid redundant EvalSymlinks calls inside the
	// security functions.
	if security.Geteuid() == 0 {
		resolved, err := filepath.EvalSymlinks(dedupPath)
		if err != nil {
			return nil, fmt.Errorf("resolve dedup path %s: %w", dedupPath, err)
		}
		dedupPath = resolved
		resolved, err = filepath.EvalSymlinks(sourceDir)
		if err != nil {
			return nil, fmt.Errorf("resolve source dir %s: %w", sourceDir, err)
		}
		sourceDir = resolved
	}
	if err := security.CheckFileOwnershipResolved(dedupPath); err != nil {
		return nil, fmt.Errorf("dedup file %s: %w", dedupPath, err)
	}
	if err := security.CheckDirectoryResolved(sourceDir); err != nil {
		return nil, fmt.Errorf("source dir %s: %w", sourceDir, err)
	}
	reader, err := dedup.NewReaderLazy(dedupPath, sourceDir)
	if err != nil {
		return nil, err
	}
	return &dedupReaderAdapter{reader: reader, readTimeout: f.ReadTimeout}, nil
}
// DefaultConfigReader is the default implementation of ConfigReader,
// delegating to dedup.ReadConfig.
type DefaultConfigReader struct{}
// ReadConfig loads a dedup config file from path and converts it to the
// fuse package's Config representation.
func (r *DefaultConfigReader) ReadConfig(path string) (*Config, error) {
	c, err := dedup.ReadConfig(path)
	if err != nil {
		return nil, err
	}
	out := &Config{
		Name:      c.Name,
		DedupFile: c.DedupFile,
		SourceDir: c.SourceDir,
	}
	return out, nil
}
package fuse
import (
"fmt"
"os"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
)
// configDebounceDelay is the time to wait after the last config file event
// before triggering the action. This coalesces rapid changes from editors
// that write to a temp file and then rename (atomic save), so a single
// save produces a single warn/reload instead of several.
const configDebounceDelay = 500 * time.Millisecond
// ConfigWatcher monitors config files for changes and either logs a warning
// or triggers a reload callback. It uses inotify for local filesystems and
// falls back to polling for network filesystems (NFS, CIFS/SMB).
type ConfigWatcher struct {
	watcher *fsnotify.Watcher // inotify-based watcher for local directories
	// configFiles is the set of absolute config file paths being watched.
	configFiles map[string]bool
	// pollFiles maps absolute config file paths to their last known mtime
	// for directories that use polling instead of inotify.
	pollFiles    map[string]time.Time
	action       string                       // "reload" or "warn"
	reloadFn     func()                       // invoked when action == "reload"
	logFn        func(string, ...interface{}) // never nil; defaults to a no-op in NewConfigWatcher
	pollInterval time.Duration                // tick interval for pollLoop
	mu           sync.Mutex                   // guards configFiles and pollFiles
	stopCh       chan struct{}                // closed by Stop to end both loops
	wg           sync.WaitGroup               // tracks eventLoop and pollLoop goroutines
}
// NewConfigWatcher creates a new config file watcher with the given action.
// action must be "reload" or "warn". If pollInterval <= 0,
// defaultPollInterval is used; if logFn is nil, logging is a no-op.
// The watcher is not started until Start() is called.
func NewConfigWatcher(action string, pollInterval time.Duration, reloadFn func(), logFn func(string, ...interface{})) (*ConfigWatcher, error) {
	if action != "reload" && action != "warn" {
		return nil, fmt.Errorf("invalid config watch action %q (must be reload or warn)", action)
	}
	fsw, err := fsnotify.NewWatcher()
	if err != nil {
		return nil, err
	}
	if pollInterval <= 0 {
		pollInterval = defaultPollInterval
	}
	if logFn == nil {
		logFn = func(string, ...interface{}) {}
	}
	cw := &ConfigWatcher{
		watcher:      fsw,
		configFiles:  make(map[string]bool),
		pollFiles:    make(map[string]time.Time),
		action:       action,
		reloadFn:     reloadFn,
		logFn:        logFn,
		pollInterval: pollInterval,
		stopCh:       make(chan struct{}),
	}
	return cw, nil
}
// Update replaces the set of watched config files. It removes old watches
// and sets up new ones. Called on mount and after reload.
//
// Local-FS directories are watched via fsnotify on the parent directory;
// network-FS directories fall back to mtime polling (see pollLoop).
// fsnotify watcher methods are thread-safe, so cw.mu only guards the
// shared maps, never the watcher calls.
func (cw *ConfigWatcher) Update(configPaths []string) {
	// Build new file set and directory set.
	newFiles := make(map[string]bool, len(configPaths))
	watchDirs := make(map[string]bool)
	for _, p := range configPaths {
		abs, err := filepath.Abs(p)
		if err != nil {
			cw.logFn("config-watch: warning: cannot resolve %s: %v", p, err)
			continue
		}
		newFiles[abs] = true
		watchDirs[filepath.Dir(abs)] = true
	}
	cw.mu.Lock()
	// Record the directories of previously watched files so their inotify
	// watches can be removed outside the lock below.
	oldDirs := make(map[string]bool)
	for f := range cw.configFiles {
		oldDirs[filepath.Dir(f)] = true
	}
	cw.configFiles = newFiles
	// Cleared here; repopulated further down only for network-FS paths.
	cw.pollFiles = make(map[string]time.Time)
	cw.mu.Unlock()
	// Remove old watches (fsnotify methods are thread-safe).
	for dir := range oldDirs {
		cw.watcher.Remove(dir)
	}
	// Precompute files per directory for efficient poll setup.
	pathsByDir := make(map[string][]string)
	for f := range newFiles {
		dir := filepath.Dir(f)
		pathsByDir[dir] = append(pathsByDir[dir], f)
	}
	// Set up new watches.
	newPollFiles := make(map[string]time.Time)
	for dir := range watchDirs {
		if isNetworkFS(dir) {
			cw.logFn("config-watch: %s is on a network filesystem, using polling", dir)
			for _, absPath := range pathsByDir[dir] {
				if info, err := os.Stat(absPath); err == nil {
					newPollFiles[absPath] = info.ModTime()
				} else {
					// Zero time marks "currently missing/unreadable";
					// pollCheck treats a later successful stat as a change.
					newPollFiles[absPath] = time.Time{}
				}
			}
		} else {
			if err := cw.watcher.Add(dir); err != nil {
				cw.logFn("config-watch: warning: cannot watch %s: %v", dir, err)
			}
		}
	}
	if len(newPollFiles) > 0 {
		cw.mu.Lock()
		cw.pollFiles = newPollFiles
		cw.mu.Unlock()
	}
	cw.logFn("config-watch: monitoring %d config files in %d directories (action=%s)",
		len(newFiles), len(watchDirs), cw.action)
}
// Start begins the event processing loops. Must be called after Update().
func (cw *ConfigWatcher) Start() {
	cw.wg.Add(2)
	go cw.eventLoop()
	go cw.pollLoop()
}
// Stop stops the watcher and waits for goroutines to exit. Closing stopCh
// signals both loops; closing the fsnotify watcher additionally closes its
// Events/Errors channels, which eventLoop detects via the !ok receive.
func (cw *ConfigWatcher) Stop() {
	close(cw.stopCh)
	cw.watcher.Close()
	cw.wg.Wait()
}
// eventLoop processes fsnotify events with debouncing: each relevant event
// on a tracked file resets a single reusable timer, and the action fires
// only once the timer expires with no further events.
func (cw *ConfigWatcher) eventLoop() {
	defer cw.wg.Done()
	// Single timer reused across events. Starts stopped; Reset activates it.
	debounceTimer := time.NewTimer(0)
	if !debounceTimer.Stop() {
		<-debounceTimer.C
	}
	defer debounceTimer.Stop()
	for {
		select {
		case event, ok := <-cw.watcher.Events:
			if !ok {
				// Events channel closed by watcher.Close() — shut down.
				return
			}
			// Only content-affecting operations are relevant; Chmod etc.
			// are ignored.
			if event.Op&(fsnotify.Write|fsnotify.Create|fsnotify.Rename|fsnotify.Remove) == 0 {
				continue
			}
			// Check if this event is for a tracked config file (we watch
			// whole directories, so unrelated files also produce events).
			cw.mu.Lock()
			tracked := cw.configFiles[event.Name]
			cw.mu.Unlock()
			if !tracked {
				continue
			}
			// Reset debounce timer — drain channel if Stop reports
			// the timer already fired to prevent a stale tick.
			if !debounceTimer.Stop() {
				select {
				case <-debounceTimer.C:
				default:
				}
			}
			debounceTimer.Reset(configDebounceDelay)
		case <-debounceTimer.C:
			// Guard against select choosing the timer case when stopCh
			// is also ready — prefer shutdown over triggering a reload.
			select {
			case <-cw.stopCh:
				return
			default:
				cw.triggerAction()
			}
		case err, ok := <-cw.watcher.Errors:
			if !ok {
				return
			}
			cw.logFn("config-watch: watcher error: %v", err)
		case <-cw.stopCh:
			return
		}
	}
}
// pollLoop periodically checks config files on network filesystems until
// the watcher is stopped.
func (cw *ConfigWatcher) pollLoop() {
	defer cw.wg.Done()
	ticker := time.NewTicker(cw.pollInterval)
	defer ticker.Stop()
	for {
		select {
		case <-cw.stopCh:
			return
		case <-ticker.C:
		}
		// Prefer shutdown if stop was signaled alongside a tick.
		select {
		case <-cw.stopCh:
			return
		default:
		}
		cw.pollCheck()
	}
}
// pollCheck stats all poll-monitored config files and triggers action if
// any have changed.
//
// The poll set is snapshotted under the lock, stat calls run without the
// lock (they may block on a slow network mount), and mtime updates are
// applied under the lock again — re-checking membership in case Update()
// replaced the map in between.
func (cw *ConfigWatcher) pollCheck() {
	type polledFile struct {
		path      string
		lastMtime time.Time
	}
	cw.mu.Lock()
	snapshot := make([]polledFile, 0, len(cw.pollFiles))
	for absPath, lastMtime := range cw.pollFiles {
		snapshot = append(snapshot, polledFile{path: absPath, lastMtime: lastMtime})
	}
	cw.mu.Unlock()
	type mtimeUpdate struct {
		path     string
		newMtime time.Time
	}
	var updates []mtimeUpdate
	changed := false
	for _, pf := range snapshot {
		info, err := os.Stat(pf.path)
		if err != nil {
			cw.logFn("config-watch: poll: cannot stat %s: %v", pf.path, err)
			// Only treat as a change on transition into error/missing state.
			// This prevents repeated reload triggers every poll tick when a
			// config file is persistently unreachable.
			if !pf.lastMtime.IsZero() {
				updates = append(updates, mtimeUpdate{path: pf.path, newMtime: time.Time{}})
				changed = true
			}
			continue
		}
		// time.Time.Equal is the correct comparison (not ==).
		if !info.ModTime().Equal(pf.lastMtime) {
			updates = append(updates, mtimeUpdate{path: pf.path, newMtime: info.ModTime()})
			changed = true
		}
	}
	if len(updates) > 0 {
		cw.mu.Lock()
		for _, u := range updates {
			// Apply only if the path is still being polled — Update() may
			// have swapped the map while we were stat'ing.
			if _, ok := cw.pollFiles[u.path]; ok {
				cw.pollFiles[u.path] = u.newMtime
			}
		}
		cw.mu.Unlock()
	}
	if changed {
		cw.triggerAction()
	}
}
// triggerAction executes the configured action (warn or reload).
// Unknown action values are silently ignored.
func (cw *ConfigWatcher) triggerAction() {
	if cw.action == "reload" {
		cw.logFn("config-watch: config file changed, triggering reload")
		cw.reloadFn()
		return
	}
	if cw.action == "warn" {
		cw.logFn("config-watch: config file changed (action=warn)")
	}
}
// Package fuse provides a FUSE filesystem for accessing deduplicated MKV files.
package fuse
import (
"sync"
"sync/atomic"
"github.com/hanwen/go-fuse/v2/fs"
)
// MKVFile represents a virtual MKV file backed by a dedup file.
// Instances are shared between the flat lookup map and the directory
// tree; reader/disabled state is guarded by mu.
type MKVFile struct {
	Name      string // full virtual path (map key in MKVFSRoot.files; may contain "/")
	DedupPath string // path to the backing dedup file
	SourceDir string // source directory used to reconstruct original data
	Size      int64  // original (pre-dedup) size in bytes, from the reader header
	reader    DedupReader  // opened lazily in ensureReader; nil until first Open
	mu        sync.RWMutex // guards reader and disabled
	// disabled is set when a source file change is detected and the
	// configured action is "disable" or "checksum" (with mismatch).
	// When true, Open/Read return EIO. Reset to false on reload.
	disabled bool
	// Factory for lazy initialization (injected from root)
	readerFactory ReaderFactory
}
// MKVFSRoot is the root node of the FUSE filesystem.
type MKVFSRoot struct {
	fs.Inode
	// Directory tree for hierarchical file organization
	rootDir *MKVFSDirNode
	// Flat map for O(1) lookup by full path (kept for backwards compatibility)
	files map[string]*MKVFile
	// mu guards the files map.
	mu      sync.RWMutex
	verbose bool // enables debug logging
	// mounted is set to true after fs.Mount() succeeds. FUSE kernel
	// notifications (NotifyDelete, NotifyEntry, NotifyContent) are only
	// safe to call when the filesystem is mounted — the go-fuse bridge
	// is nil before mount, causing panics.
	mounted atomic.Bool
	// Factories for dependency injection (allows mocking in tests)
	readerFactory ReaderFactory
	configReader  ConfigReader
	// Permission store for chmod/chown support
	permStore *PermissionStore
}
// MKVFSNode represents a file node in the FUSE filesystem.
// It wraps a shared *MKVFile; per-file synchronization lives on file.mu.
type MKVFSNode struct {
	fs.Inode
	file      *MKVFile
	path      string // full path for permission lookups
	verbose   bool   // enables debug logging
	permStore *PermissionStore
}
// MKVFSDirNode represents a directory node in the FUSE filesystem.
// mu guards the files and subdirs maps.
type MKVFSDirNode struct {
	fs.Inode
	name    string                   // basename (e.g., "Action")
	path    string                   // full path from root (e.g., "Movies/Action")
	files   map[string]*MKVFile      // files directly in this directory
	subdirs map[string]*MKVFSDirNode // child directories
	mu      sync.RWMutex
	verbose bool // enables debug logging
	// Factory for creating file nodes (injected from root)
	readerFactory ReaderFactory
	// Permission store for chmod/chown support
	permStore *PermissionStore
}
// Compile-time assertions that each node type satisfies the go-fuse
// interfaces it is expected to implement.
var (
	_ fs.InodeEmbedder = (*MKVFSRoot)(nil)
	_ fs.InodeEmbedder = (*MKVFSNode)(nil)
	_ fs.InodeEmbedder = (*MKVFSDirNode)(nil)
	_ fs.NodeReaddirer = (*MKVFSRoot)(nil)
	_ fs.NodeLookuper  = (*MKVFSRoot)(nil)
	_ fs.NodeGetattrer = (*MKVFSRoot)(nil)
	_ fs.NodeReaddirer = (*MKVFSDirNode)(nil)
	_ fs.NodeLookuper  = (*MKVFSDirNode)(nil)
	_ fs.NodeGetattrer = (*MKVFSDirNode)(nil)
	_ fs.NodeMkdirer   = (*MKVFSDirNode)(nil)
	_ fs.NodeRmdirer   = (*MKVFSDirNode)(nil)
	_ fs.NodeUnlinker  = (*MKVFSDirNode)(nil)
	_ fs.NodeCreater   = (*MKVFSDirNode)(nil)
	_ fs.NodeOpener    = (*MKVFSNode)(nil)
	_ fs.NodeReader    = (*MKVFSNode)(nil)
	_ fs.NodeGetattrer = (*MKVFSNode)(nil)
	_ fs.NodeSetattrer = (*MKVFSNode)(nil)
	_ fs.NodeSetattrer = (*MKVFSDirNode)(nil)
)
// getFilePerms returns file permissions from the store, or defaults if store is nil.
func getFilePerms(store *PermissionStore, path string) (uid, gid, mode uint32) {
	if store == nil {
		// Default: root-owned, world-readable regular file.
		return 0, 0, 0444
	}
	return store.GetFilePerms(path)
}
// getDirPerms returns directory permissions from the store, or defaults if store is nil.
func getDirPerms(store *PermissionStore, path string) (uid, gid, mode uint32) {
	if store == nil {
		// Default: root-owned, world-listable directory.
		return 0, 0, 0555
	}
	return store.GetDirPerms(path)
}
package fuse
import (
"context"
"log"
"sort"
"syscall"
"time"
"github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
)
// --- MKVFSDirNode interface implementations ---
// Readdir implements fs.NodeReaddirer - lists files and subdirectories.
// It is a thin delegate to readdirInternal, which does the actual work.
func (d *MKVFSDirNode) Readdir(ctx context.Context) (fs.DirStream, syscall.Errno) {
	// Permission checks are handled by the kernel via default_permissions mount option.
	return d.readdirInternal(ctx)
}
// readdirInternal builds the directory listing: subdirectories first, then
// files, each group sorted by name for deterministic output. It performs no
// permission checks itself (those are handled by the kernel via the
// default_permissions mount option) and is shared by both MKVFSRoot.Readdir
// and MKVFSDirNode.Readdir.
func (d *MKVFSDirNode) readdirInternal(ctx context.Context) (fs.DirStream, syscall.Errno) {
	d.mu.RLock()
	defer d.mu.RUnlock()
	if d.verbose {
		log.Printf("Readdir: %s (files=%d, subdirs=%d)", d.path, len(d.files), len(d.subdirs))
	}
	// Sort both name sets up front so the listing order is stable.
	dirNames := make([]string, 0, len(d.subdirs))
	for name := range d.subdirs {
		dirNames = append(dirNames, name)
	}
	sort.Strings(dirNames)
	fileNames := make([]string, 0, len(d.files))
	for name := range d.files {
		fileNames = append(fileNames, name)
	}
	sort.Strings(fileNames)
	entries := make([]fuse.DirEntry, 0, len(dirNames)+len(fileNames))
	for _, name := range dirNames {
		if d.verbose {
			log.Printf("Readdir: adding subdir %s", name)
		}
		entries = append(entries, fuse.DirEntry{Name: name, Mode: fuse.S_IFDIR})
	}
	for _, name := range fileNames {
		if d.verbose {
			log.Printf("Readdir: adding file %s", name)
		}
		entries = append(entries, fuse.DirEntry{Name: name, Mode: fuse.S_IFREG})
	}
	return fs.NewListDirStream(entries), 0
}
// Lookup implements fs.NodeLookuper - looks up a file or subdirectory by name.
// Subdirectories are checked before files; on a hit, the entry's attributes
// are filled and a child inode is returned. Timestamps are synthesized from
// the current time on every lookup (this FS has no stored mtimes).
func (d *MKVFSDirNode) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
	// Permission checks are handled by the kernel via default_permissions mount option.
	d.mu.RLock()
	defer d.mu.RUnlock()
	// Check subdirectories first
	if subdir, ok := d.subdirs[name]; ok {
		if d.verbose {
			log.Printf("Lookup: found subdir %s in %s", name, d.path)
		}
		// Lock subdir to safely access its fields
		subdir.mu.RLock()
		subdirCount := len(subdir.subdirs)
		subdir.mu.RUnlock()
		uid, gid, mode := getDirPerms(d.permStore, subdir.path)
		now := time.Now()
		out.Mode = fuse.S_IFDIR | mode
		out.Uid = uid
		out.Gid = gid
		out.Atime = uint64(now.Unix())
		out.Mtime = uint64(now.Unix())
		out.Ctime = uint64(now.Unix())
		// Unix convention: "." and ".." plus one link per child directory.
		out.Nlink = 2 + uint32(subdirCount)
		// Stable inode number derived from the path so the same directory
		// always maps to the same ino across lookups.
		stable := fs.StableAttr{
			Mode: fuse.S_IFDIR,
			Ino:  hashString(subdir.path),
		}
		// Persistent inode: directories are kept registered with go-fuse so
		// later kernel notifications can reference them.
		child := d.NewPersistentInode(ctx, subdir, stable)
		return child, 0
	}
	// Check files
	if file, ok := d.files[name]; ok {
		if d.verbose {
			log.Printf("Lookup: found file %s in %s (size=%d)", name, d.path, file.Size)
		}
		// Build the full path for permission lookups (root has empty path).
		var filePath string
		if d.path == "" {
			filePath = name
		} else {
			filePath = d.path + "/" + name
		}
		uid, gid, mode := getFilePerms(d.permStore, filePath)
		now := time.Now()
		out.Size = uint64(file.Size)
		out.Mode = fuse.S_IFREG | mode
		out.Uid = uid
		out.Gid = gid
		out.Atime = uint64(now.Unix())
		out.Mtime = uint64(now.Unix())
		out.Ctime = uint64(now.Unix())
		out.Nlink = 1
		node := &MKVFSNode{file: file, path: filePath, verbose: d.verbose, permStore: d.permStore}
		stable := fs.StableAttr{
			Mode: fuse.S_IFREG,
			Ino:  hashString(filePath),
		}
		// Note: files use NewInode (non-persistent), unlike directories above,
		// so file nodes can be released by the kernel when forgotten.
		child := d.NewInode(ctx, node, stable)
		return child, 0
	}
	if d.verbose {
		log.Printf("Lookup: not found %s in %s", name, d.path)
	}
	return nil, syscall.ENOENT
}
// Getattr implements fs.NodeGetattrer - returns directory attributes.
// Timestamps are synthesized from the current time; Nlink follows the Unix
// convention of 2 ("." and "..") plus one per child directory.
func (d *MKVFSDirNode) Getattr(ctx context.Context, fh fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	d.mu.RLock()
	defer d.mu.RUnlock()
	uid, gid, mode := getDirPerms(d.permStore, d.path)
	ts := uint64(time.Now().Unix())
	out.Mode = fuse.S_IFDIR | mode
	out.Uid = uid
	out.Gid = gid
	out.Atime = ts
	out.Mtime = ts
	out.Ctime = ts
	out.Nlink = 2 + uint32(len(d.subdirs))
	return 0
}
// Setattr implements fs.NodeSetattrer - handles chmod/chown on directories.
// Only ownership (uid/gid) and mode changes are accepted; everything else
// (truncate, utimes, ...) fails with EROFS. Changes are persisted via the
// permission store. Mirrors MKVFSNode.Setattr for files.
func (d *MKVFSDirNode) Setattr(ctx context.Context, fh fs.FileHandle, in *fuse.SetAttrIn, out *fuse.AttrOut) syscall.Errno {
	if d.permStore == nil {
		// No permission store - can't change permissions
		return syscall.EROFS
	}
	// Only UID, GID, and mode changes are supported. All other setattr operations
	// (e.g. size truncation, atime/mtime updates) must fail on this read-only FS.
	supportedMask := uint32(fuse.FATTR_UID | fuse.FATTR_GID | fuse.FATTR_MODE)
	if in.Valid&^supportedMask != 0 {
		return syscall.EROFS
	}
	// Get current permissions and caller
	dirUID, dirGID, dirMode := getDirPerms(d.permStore, d.path)
	caller, ok := GetCaller(ctx)
	if !ok {
		// Caller identity unavailable — cannot authorize the change.
		return syscall.EACCES
	}
	// nil pointer means "leave this attribute unchanged".
	var newUID, newGID, newMode *uint32
	// Check which fields are being changed
	if in.Valid&fuse.FATTR_UID != 0 {
		newUID = &in.Uid
	}
	if in.Valid&fuse.FATTR_GID != 0 {
		newGID = &in.Gid
	}
	if in.Valid&fuse.FATTR_MODE != 0 {
		mode := in.Mode & 0777 // Only permission bits
		newMode = &mode
	}
	// Normalize no-op changes to nil to avoid unnecessary disk writes
	if newUID != nil && *newUID == dirUID {
		newUID = nil
	}
	if newGID != nil && *newGID == dirGID {
		newGID = nil
	}
	if newMode != nil && *newMode == dirMode {
		newMode = nil
	}
	// Permission checks for chown
	if newUID != nil || newGID != nil {
		if errno := CheckChown(caller, dirUID, dirGID, newUID, newGID); errno != 0 {
			if d.verbose {
				log.Printf("Setattr: chown permission denied for %s (caller uid=%d)", d.path, caller.Uid)
			}
			return errno
		}
	}
	// Permission checks for chmod
	if newMode != nil {
		if errno := CheckChmod(caller, dirUID); errno != 0 {
			if d.verbose {
				log.Printf("Setattr: chmod permission denied for %s (caller uid=%d)", d.path, caller.Uid)
			}
			return errno
		}
	}
	// Update permission store (persists to disk)
	if err := d.permStore.SetDirPerms(d.path, newUID, newGID, newMode); err != nil {
		if d.verbose {
			log.Printf("Setattr error: %s: %v", d.path, err)
		}
		return syscall.EIO
	}
	if d.verbose {
		log.Printf("Setattr: %s uid=%v gid=%v mode=%v", d.path, newUID, newGID, newMode)
	}
	// Return updated attributes
	return d.Getattr(ctx, fh, out)
}
// --- Read-only filesystem error handlers ---
// These return EROFS (Read-only file system) for write operations.
// Mkdir implements fs.NodeMkdirer - rejects directory creation.
func (d *MKVFSDirNode) Mkdir(ctx context.Context, name string, mode uint32, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
	if d.verbose {
		log.Printf("Mkdir: rejected (read-only) %s in %s", name, d.path)
	}
	return nil, syscall.EROFS
}
// Rmdir implements fs.NodeRmdirer - rejects directory removal.
func (d *MKVFSDirNode) Rmdir(ctx context.Context, name string) syscall.Errno {
	if d.verbose {
		log.Printf("Rmdir: rejected (read-only) %s in %s", name, d.path)
	}
	return syscall.EROFS
}
// Unlink implements fs.NodeUnlinker - rejects file deletion.
func (d *MKVFSDirNode) Unlink(ctx context.Context, name string) syscall.Errno {
	if d.verbose {
		log.Printf("Unlink: rejected (read-only) %s in %s", name, d.path)
	}
	return syscall.EROFS
}
// Create implements fs.NodeCreater - rejects file creation.
func (d *MKVFSDirNode) Create(ctx context.Context, name string, flags uint32, mode uint32, out *fuse.EntryOut) (node *fs.Inode, fh fs.FileHandle, fuseFlags uint32, errno syscall.Errno) {
	if d.verbose {
		log.Printf("Create: rejected (read-only) %s in %s", name, d.path)
	}
	return nil, nil, 0, syscall.EROFS
}
package fuse
import (
"fmt"
"log"
"path/filepath"
"sync"
"github.com/stuckj/mkvdup/internal/dedup"
)
// MKVFSOptions contains options for creating an MKVFS filesystem.
type MKVFSOptions struct {
	Verbose bool // enables debug logging
	// PermissionsPath, when non-empty, enables the permission store
	// (chmod/chown persistence) backed by this file path.
	PermissionsPath string
	// Defaults holds the default permissions to use when a PermissionStore is configured.
	// If nil, DefaultPerms() is used. Set to a non-nil value to use specific defaults.
	// Note: explicit-zero defaults only work when provided programmatically here;
	// they are not persisted to or loaded from the permissions YAML file.
	Defaults *Defaults
}
// NewMKVFS creates a new MKVFS root from a list of config files.
// Config files are resolved recursively (includes and virtual_files are expanded).
// Set verbose=true to enable debug logging.
func NewMKVFS(configPaths []string, verbose bool) (*MKVFSRoot, error) {
	resolved, _, _, err := dedup.ResolveConfigs(configPaths)
	if err != nil {
		return nil, fmt.Errorf("resolve configs: %w", err)
	}
	// No permission store: the mount exposes default read-only permissions.
	return NewMKVFSFromConfigs(resolved, verbose, &DefaultReaderFactory{}, nil)
}
// NewMKVFSWithPermissions creates a new MKVFS root with a permission store.
// Config files are resolved recursively (includes and virtual_files are expanded).
func NewMKVFSWithPermissions(configPaths []string, verbose bool, permStore *PermissionStore) (*MKVFSRoot, error) {
	resolved, _, _, err := dedup.ResolveConfigs(configPaths)
	if err != nil {
		return nil, fmt.Errorf("resolve configs: %w", err)
	}
	return NewMKVFSFromConfigs(resolved, verbose, &DefaultReaderFactory{}, permStore)
}
// NewMKVFSWithOptions creates a new MKVFS root with the given options.
// Config files are resolved recursively (includes and virtual_files are expanded).
// The permission store (if configured) is loaded before config resolution,
// so a broken permissions file fails fast.
func NewMKVFSWithOptions(configPaths []string, opts MKVFSOptions) (*MKVFSRoot, error) {
	var permStore *PermissionStore
	if opts.PermissionsPath != "" {
		// Pick explicit defaults when provided, else the package defaults.
		perms := DefaultPerms()
		if opts.Defaults != nil {
			perms = *opts.Defaults
		}
		permStore = NewPermissionStore(opts.PermissionsPath, perms, opts.Verbose)
		if err := permStore.Load(); err != nil {
			return nil, fmt.Errorf("load permissions: %w", err)
		}
	}
	resolved, _, _, err := dedup.ResolveConfigs(configPaths)
	if err != nil {
		return nil, fmt.Errorf("resolve configs: %w", err)
	}
	return NewMKVFSFromConfigs(resolved, opts.Verbose, &DefaultReaderFactory{}, permStore)
}
// NewMKVFSWithFactories creates a new MKVFS root with custom factories.
// This allows injecting mock implementations for testing.
//
// Unlike NewMKVFSFromConfigs, configs are read via the injected ConfigReader
// and relative dedup/source paths are resolved against each config file's
// directory. Dedup headers are read sequentially.
func NewMKVFSWithFactories(configPaths []string, verbose bool, readerFactory ReaderFactory, configReader ConfigReader, permStore *PermissionStore) (*MKVFSRoot, error) {
	root := &MKVFSRoot{
		files:         make(map[string]*MKVFile),
		verbose:       verbose,
		readerFactory: readerFactory,
		configReader:  configReader,
		permStore:     permStore,
	}
	if verbose {
		log.Printf("Creating MKVFS with %d config files", len(configPaths))
	}
	for _, configPath := range configPaths {
		if verbose {
			log.Printf("Reading config: %s", configPath)
		}
		config, err := root.configReader.ReadConfig(configPath)
		if err != nil {
			return nil, fmt.Errorf("read config %s: %w", configPath, err)
		}
		if verbose {
			log.Printf("Config: name=%s, dedup=%s, source=%s", config.Name, config.DedupFile, config.SourceDir)
		}
		// Resolve relative paths against the config file's directory.
		configDir := filepath.Dir(configPath)
		dedupPath := config.DedupFile
		if !filepath.IsAbs(dedupPath) {
			dedupPath = filepath.Join(configDir, dedupPath)
		}
		sourceDir := config.SourceDir
		if !filepath.IsAbs(sourceDir) {
			sourceDir = filepath.Join(configDir, sourceDir)
		}
		// Open dedup file to get size (lazy loading - only reads header)
		if verbose {
			log.Printf("Opening dedup file: %s", dedupPath)
		}
		reader, err := root.readerFactory.NewReaderLazy(dedupPath, sourceDir)
		if err != nil {
			if verbose {
				log.Printf("Failed to open dedup file: %v", err)
			}
			return nil, fmt.Errorf("open dedup file %s: %w", dedupPath, err)
		}
		mkvFile := &MKVFile{
			Name:          config.Name,
			DedupPath:     dedupPath,
			SourceDir:     sourceDir,
			Size:          reader.OriginalSize(),
			readerFactory: root.readerFactory,
		}
		// Don't keep reader open - we'll open it lazily
		reader.Close()
		// Warn on duplicate names (consistent with Reload): the later config
		// silently wins in the map, which is almost always a config mistake.
		if prev, dup := root.files[config.Name]; dup {
			log.Printf("Warning: duplicate name %q (dedup: %s replaced by %s)", config.Name, prev.DedupPath, dedupPath)
		}
		root.files[config.Name] = mkvFile
		if verbose {
			log.Printf("Added file: %s (size=%d)", config.Name, mkvFile.Size)
		}
	}
	if verbose {
		log.Printf("Total files: %d", len(root.files))
	}
	// Build directory tree from collected files
	fileList := make([]*MKVFile, 0, len(root.files))
	for _, f := range root.files {
		fileList = append(fileList, f)
	}
	root.rootDir = BuildDirectoryTree(fileList, verbose, readerFactory, permStore)
	// Clean up stale permission entries if we have a permission store
	if permStore != nil {
		validFiles, validDirs := root.collectValidPaths()
		removed := permStore.CleanupStale(validFiles, validDirs)
		if removed > 0 {
			if verbose {
				log.Printf("Cleaned up %d stale permission entries", removed)
			}
			if err := permStore.Save(); err != nil {
				log.Printf("Warning: failed to save permissions after cleanup: %v", err)
			}
		}
	}
	if verbose {
		log.Printf("Directory tree built with %d root entries", len(root.rootDir.files)+len(root.rootDir.subdirs))
	}
	return root, nil
}
// maxParallelReaders limits concurrent dedup header reads to avoid
// exhausting file descriptors when mounting thousands of files.
// Used as the worker-pool size in readConfigHeaders and Reload.
const maxParallelReaders = 64
// readConfigHeaders reads dedup file headers in parallel with concurrency
// bounded by maxParallelReaders. It returns a slice of MKVFile (indexed by
// config position) and the first error encountered. On error, no partial
// results are returned and the slice is nil.
func readConfigHeaders(configs []dedup.Config, readerFactory ReaderFactory, verbose bool) ([]*MKVFile, error) {
	// Each worker writes only to its own index, so no lock is needed here.
	results := make([]*MKVFile, len(configs))
	// For small counts, read sequentially to avoid goroutine overhead
	if len(configs) <= 4 {
		for i, config := range configs {
			if verbose {
				log.Printf("Opening dedup file: %s", config.DedupFile)
			}
			reader, err := readerFactory.NewReaderLazy(config.DedupFile, config.SourceDir)
			if err != nil {
				return nil, fmt.Errorf("open dedup file %s: %w", config.DedupFile, err)
			}
			results[i] = &MKVFile{
				Name:          config.Name,
				DedupPath:     config.DedupFile,
				SourceDir:     config.SourceDir,
				Size:          reader.OriginalSize(),
				readerFactory: readerFactory,
			}
			// Header read is enough; the reader is re-opened lazily later.
			reader.Close()
		}
		return results, nil
	}
	var (
		wg    sync.WaitGroup
		errMu sync.Mutex // guards first
		first error      // first error observed by any worker
	)
	// Fixed-size worker pool pulling jobs from a channel.
	numWorkers := maxParallelReaders
	if len(configs) < numWorkers {
		numWorkers = len(configs)
	}
	jobs := make(chan int)
	wg.Add(numWorkers)
	for range numWorkers {
		go func() {
			defer wg.Done()
			for idx := range jobs {
				// Skip work if another worker already failed,
				// but keep draining jobs to avoid deadlocking the sender.
				errMu.Lock()
				failed := first != nil
				errMu.Unlock()
				if failed {
					continue
				}
				cfg := configs[idx]
				reader, err := readerFactory.NewReaderLazy(cfg.DedupFile, cfg.SourceDir)
				if err != nil {
					errMu.Lock()
					if first == nil {
						first = fmt.Errorf("open dedup file %s: %w", cfg.DedupFile, err)
					}
					errMu.Unlock()
					continue
				}
				results[idx] = &MKVFile{
					Name:          cfg.Name,
					DedupPath:     cfg.DedupFile,
					SourceDir:     cfg.SourceDir,
					Size:          reader.OriginalSize(),
					readerFactory: readerFactory,
				}
				reader.Close()
			}
		}()
	}
	// Feed all indices; the unbuffered channel naturally throttles the
	// producer to the pool's consumption rate.
	for i := range configs {
		jobs <- i
	}
	close(jobs)
	wg.Wait()
	if first != nil {
		// Discard partial results on any failure.
		return nil, first
	}
	return results, nil
}
// NewMKVFSFromConfigs creates a new MKVFS root from already-resolved configs.
// Paths in configs must already be absolute (as returned by dedup.ResolveConfigs).
// Dedup file headers are read in parallel for faster startup with many files.
func NewMKVFSFromConfigs(configs []dedup.Config, verbose bool, readerFactory ReaderFactory, permStore *PermissionStore) (*MKVFSRoot, error) {
	root := &MKVFSRoot{
		files:         make(map[string]*MKVFile),
		verbose:       verbose,
		readerFactory: readerFactory,
		permStore:     permStore,
	}
	if verbose {
		log.Printf("Creating MKVFS with %d resolved configs", len(configs))
	}
	mkvFiles, err := readConfigHeaders(configs, readerFactory, verbose)
	if err != nil {
		return nil, err
	}
	for _, mkvFile := range mkvFiles {
		if mkvFile == nil {
			// Defensive: readConfigHeaders only leaves nil entries on the
			// error path, which returns before we get here.
			continue
		}
		// Warn on duplicate names (consistent with Reload): the later config
		// silently wins in the map, which is almost always a config mistake.
		if prev, dup := root.files[mkvFile.Name]; dup {
			log.Printf("Warning: duplicate name %q (dedup: %s replaced by %s)", mkvFile.Name, prev.DedupPath, mkvFile.DedupPath)
		}
		root.files[mkvFile.Name] = mkvFile
		if verbose {
			log.Printf("Added file: %s (size=%d)", mkvFile.Name, mkvFile.Size)
		}
	}
	if verbose {
		log.Printf("Total files: %d", len(root.files))
	}
	// Build directory tree from collected files
	fileList := make([]*MKVFile, 0, len(root.files))
	for _, f := range root.files {
		fileList = append(fileList, f)
	}
	root.rootDir = BuildDirectoryTree(fileList, verbose, readerFactory, permStore)
	// Clean up stale permission entries if we have a permission store
	if permStore != nil {
		validFiles, validDirs := root.collectValidPaths()
		removed := permStore.CleanupStale(validFiles, validDirs)
		if removed > 0 {
			if verbose {
				log.Printf("Cleaned up %d stale permission entries", removed)
			}
			if err := permStore.Save(); err != nil {
				log.Printf("Warning: failed to save permissions after cleanup: %v", err)
			}
		}
	}
	if verbose {
		log.Printf("Directory tree built with %d root entries", len(root.rootDir.files)+len(root.rootDir.subdirs))
	}
	return root, nil
}
package fuse
import (
"context"
"fmt"
"log"
"syscall"
"time"
"github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
)
// Getattr implements fs.NodeGetattrer - returns file attributes.
// Size comes from the dedup header; timestamps are synthesized from the
// current time (this FS stores no per-file times).
func (n *MKVFSNode) Getattr(ctx context.Context, fh fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	uid, gid, mode := getFilePerms(n.permStore, n.path)
	ts := uint64(time.Now().Unix())
	out.Size = uint64(n.file.Size)
	out.Mode = fuse.S_IFREG | mode
	out.Uid = uid
	out.Gid = gid
	out.Atime = ts
	out.Mtime = ts
	out.Ctime = ts
	out.Nlink = 1
	return 0
}
// Setattr implements fs.NodeSetattrer - handles chmod/chown on files.
// Only ownership (uid/gid) and mode changes are accepted; everything else
// (truncate, utimes, ...) fails with EROFS. Changes are persisted via the
// permission store. Mirrors MKVFSDirNode.Setattr for directories.
func (n *MKVFSNode) Setattr(ctx context.Context, fh fs.FileHandle, in *fuse.SetAttrIn, out *fuse.AttrOut) syscall.Errno {
	if n.permStore == nil {
		// No permission store - can't change permissions
		return syscall.EROFS
	}
	// Only UID, GID, and mode changes are supported. All other setattr operations
	// (e.g. size truncation, atime/mtime updates) must fail on this read-only FS.
	supportedMask := uint32(fuse.FATTR_UID | fuse.FATTR_GID | fuse.FATTR_MODE)
	if in.Valid&^supportedMask != 0 {
		return syscall.EROFS
	}
	// Get current permissions and caller
	fileUID, fileGID, fileMode := getFilePerms(n.permStore, n.path)
	caller, ok := GetCaller(ctx)
	if !ok {
		// Caller identity unavailable — cannot authorize the change.
		return syscall.EACCES
	}
	// nil pointer means "leave this attribute unchanged".
	var newUID, newGID, newMode *uint32
	// Check which fields are being changed
	if in.Valid&fuse.FATTR_UID != 0 {
		newUID = &in.Uid
	}
	if in.Valid&fuse.FATTR_GID != 0 {
		newGID = &in.Gid
	}
	if in.Valid&fuse.FATTR_MODE != 0 {
		mode := in.Mode & 0777 // Only permission bits
		newMode = &mode
	}
	// Normalize no-op changes to nil to avoid unnecessary disk writes
	if newUID != nil && *newUID == fileUID {
		newUID = nil
	}
	if newGID != nil && *newGID == fileGID {
		newGID = nil
	}
	if newMode != nil && *newMode == fileMode {
		newMode = nil
	}
	// Permission checks for chown
	if newUID != nil || newGID != nil {
		if errno := CheckChown(caller, fileUID, fileGID, newUID, newGID); errno != 0 {
			if n.verbose {
				log.Printf("Setattr: chown permission denied for %s (caller uid=%d)", n.path, caller.Uid)
			}
			return errno
		}
	}
	// Permission checks for chmod
	if newMode != nil {
		if errno := CheckChmod(caller, fileUID); errno != 0 {
			if n.verbose {
				log.Printf("Setattr: chmod permission denied for %s (caller uid=%d)", n.path, caller.Uid)
			}
			return errno
		}
	}
	// Update permission store (persists to disk)
	if err := n.permStore.SetFilePerms(n.path, newUID, newGID, newMode); err != nil {
		if n.verbose {
			log.Printf("Setattr error: %s: %v", n.path, err)
		}
		return syscall.EIO
	}
	if n.verbose {
		log.Printf("Setattr: %s uid=%v gid=%v mode=%v", n.path, newUID, newGID, newMode)
	}
	// Return updated attributes
	return n.Getattr(ctx, fh, out)
}
// Open implements fs.NodeOpener - opens a file for reading.
// Rejects any write-mode open, returns EIO if the file has been disabled
// (source change detected), and otherwise lazily initializes the dedup reader.
func (n *MKVFSNode) Open(ctx context.Context, flags uint32) (fs.FileHandle, uint32, syscall.Errno) {
	// This is a read-only filesystem - reject any write access or operations
	// that would modify the filesystem. Note: O_RDONLY|O_APPEND is a valid
	// read-only open on Linux (positions at EOF), so we only check access mode.
	accMode := flags & syscall.O_ACCMODE
	if accMode != syscall.O_RDONLY || flags&(syscall.O_TRUNC|syscall.O_CREAT) != 0 {
		return nil, 0, syscall.EROFS
	}
	// Permission checks are handled by the kernel via default_permissions mount option.
	// Check if file was disabled due to source file change
	n.file.mu.RLock()
	disabled := n.file.disabled
	n.file.mu.RUnlock()
	if disabled {
		if n.verbose {
			log.Printf("Open: %s: source file changed, file disabled", n.file.Name)
		}
		return nil, 0, syscall.EIO
	}
	if n.verbose {
		log.Printf("Open: %s", n.file.Name)
	}
	// Initialize reader lazily if needed
	if err := n.ensureReader(); err != nil {
		if n.verbose {
			log.Printf("Open error: %s: %v", n.file.Name, err)
		}
		return nil, 0, syscall.EIO
	}
	// NOTE(review): FOPEN_CACHE_DIR is a directory-open flag in the FUSE
	// protocol; returning it from a regular-file Open looks unintended —
	// confirm against go-fuse docs (FOPEN_KEEP_CACHE alone is typical here).
	return nil, fuse.FOPEN_KEEP_CACHE | fuse.FOPEN_CACHE_DIR, 0
}
// Read implements fs.NodeReader - reads data from the file.
// Reads past EOF return empty data; reads spanning EOF are clamped to the
// file size. Partial reads with an error still return the bytes read.
func (n *MKVFSNode) Read(ctx context.Context, fh fs.FileHandle, dest []byte, off int64) (fuse.ReadResult, syscall.Errno) {
	// Permission checks are handled by the kernel via default_permissions mount option.
	n.file.mu.RLock()
	defer n.file.mu.RUnlock()
	if n.file.disabled {
		if n.verbose {
			log.Printf("Read error: %s: source file changed, file disabled", n.file.Name)
		}
		return nil, syscall.EIO
	}
	if n.file.reader == nil {
		if n.verbose {
			log.Printf("Read error: %s: reader not initialized", n.file.Name)
		}
		return nil, syscall.EIO
	}
	// Offset at or past EOF: empty result.
	if off >= n.file.Size {
		return fuse.ReadResultData(nil), 0
	}
	// Clamp the request so it never reads past the virtual file size.
	if remaining := n.file.Size - off; int64(len(dest)) > remaining {
		dest = dest[:remaining]
	}
	nRead, err := n.file.reader.ReadAt(dest, off)
	if err != nil && nRead == 0 {
		if n.verbose {
			log.Printf("Read error: %s at offset %d: %v", n.file.Name, off, err)
		}
		return nil, syscall.EIO
	}
	if n.verbose {
		log.Printf("Read: %s offset=%d len=%d read=%d", n.file.Name, off, len(dest), nRead)
	}
	return fuse.ReadResultData(dest[:nRead]), 0
}
// ensureReader ensures the dedup reader is initialized, opening it lazily
// via the injected factory and preparing it for reads.
func (n *MKVFSNode) ensureReader() error {
	n.file.mu.Lock()
	defer n.file.mu.Unlock()
	// Re-check disabled under the write lock: Open's earlier check was done
	// under a separate RLock, so Disable may have run in between. Without
	// this, a racing Open could attach a fresh reader to a disabled file,
	// leaving it open until the next Close despite reads being blocked.
	if n.file.disabled {
		return fmt.Errorf("file disabled: %s", n.file.Name)
	}
	if n.file.reader != nil {
		// Already initialized by a previous Open.
		return nil
	}
	// Open dedup file with lazy loading using the factory
	reader, err := n.file.readerFactory.NewReaderLazy(n.file.DedupPath, n.file.SourceDir)
	if err != nil {
		return fmt.Errorf("open dedup file: %w", err)
	}
	// Initialize the reader for reading (handles ES vs raw internally)
	if err := reader.InitializeForReading(n.file.SourceDir); err != nil {
		reader.Close()
		return fmt.Errorf("initialize reader: %w", err)
	}
	n.file.reader = reader
	return nil
}
// Disable marks the file as disabled (source changed). Subsequent reads
// return EIO. Closes any active reader. Thread-safe.
func (f *MKVFile) Disable() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.disabled = true
	if r := f.reader; r != nil {
		f.reader = nil
		r.Close()
	}
}
// Enable re-enables a previously disabled file (e.g., after checksum
// verification confirms the source is OK). The reader will be lazily
// re-initialized on next Open.
func (f *MKVFile) Enable() {
	f.mu.Lock()
	f.disabled = false
	f.mu.Unlock()
}
// Close cleans up the file's resources, releasing any open reader.
// Safe to call when no reader is open. Thread-safe.
func (f *MKVFile) Close() {
	f.mu.Lock()
	defer f.mu.Unlock()
	if r := f.reader; r != nil {
		f.reader = nil
		r.Close()
	}
}
// updateFrom copies data fields from src into f. If the underlying dedup file
// changed, any active reader is closed since it's no longer valid.
// The caller must hold f.mu (write lock).
func (f *MKVFile) updateFrom(src *MKVFile) {
	sameBacking := f.DedupPath == src.DedupPath && f.SourceDir == src.SourceDir
	// An open reader for a different backing file is stale — drop it.
	if !sameBacking && f.reader != nil {
		f.reader.Close()
		f.reader = nil
	}
	f.Name = src.Name
	f.DedupPath = src.DedupPath
	f.SourceDir = src.SourceDir
	f.Size = src.Size
	f.readerFactory = src.readerFactory
	// Reset disabled flag — reload re-validates source files
	f.disabled = false
}
// hashString creates a stable inode number from a string using the djb2
// scheme (h = h*33 + c, seeded with 5381). It ranges over runes, so
// multi-byte UTF-8 input hashes by code point rather than by byte.
func hashString(s string) uint64 {
	h := uint64(5381)
	for _, r := range s {
		h = h*33 + uint64(r)
	}
	return h
}
package fuse
import (
"context"
"fmt"
"log"
"path"
"strings"
"sync"
"syscall"
"time"
"github.com/hanwen/go-fuse/v2/fs"
"github.com/hanwen/go-fuse/v2/fuse"
"github.com/stuckj/mkvdup/internal/dedup"
)
// reloadNotification captures a pending FUSE kernel notification to emit
// after all locks are released (go-fuse notifications must not be called
// while holding filesystem locks, as the kernel may call back into the FS).
type reloadNotification struct {
	parent   *fs.Inode // directory inode the notification targets
	child    *fs.Inode // non-nil for deletions (if kernel had cached the inode)
	name     string    // entry name within parent
	isDelete bool      // true → NotifyDelete; false → entry invalidation
}
// findParentInode walks the directory tree to find the parent inode for a
// given file path (e.g., "Movies/Action/film.mkv"). Returns the parent's
// go-fuse Inode and the basename, or (nil, "") if the parent directory
// doesn't exist in the tree.
//
// For root-level files (no directory component), returns r.Inode.
// Caller must NOT hold directory locks — this method acquires them.
func (r *MKVFSRoot) findParentInode(filePath string) (*fs.Inode, string) {
cleaned := path.Clean(filePath)
parts := strings.Split(cleaned, "/")
// Filter empty parts (handles leading slashes)
valid := make([]string, 0, len(parts))
for _, p := range parts {
if p != "" && p != "." {
valid = append(valid, p)
}
}
if len(valid) == 0 {
return nil, ""
}
basename := valid[len(valid)-1]
dirParts := valid[:len(valid)-1]
if len(dirParts) == 0 {
// File is at root level — parent is the root inode
return &r.Inode, basename
}
// Walk directory tree to find parent
current := r.rootDir
for _, part := range dirParts {
current.mu.RLock()
subdir, ok := current.subdirs[part]
current.mu.RUnlock()
if !ok {
return nil, ""
}
current = subdir
}
// Newly created directories from mergeDirectoryTree have uninitialized
// fs.Inode (never registered with go-fuse via NewPersistentInode).
// The kernel doesn't know about them, so notifications would panic.
// Return nil — the kernel will discover the directory via Lookup.
if current.Inode.StableAttr().Ino == 0 {
return nil, ""
}
return ¤t.Inode, basename
}
// markAncestorDirs walks from inode up to (and including) the root,
// adding each ancestor to changedDirs so their readdir caches are
// invalidated. This is necessary because a file addition or removal
// in a deeply nested virtual directory may cause intermediate
// directories to be created or removed by the tree merge.
func markAncestorDirs(inode *fs.Inode, changedDirs map[*fs.Inode]bool) {
	node := inode
	for {
		_, parent := node.Parent()
		if parent == nil {
			// Reached above the root — done.
			return
		}
		if changedDirs[parent] {
			// Already marked, so every ancestor above it must be too.
			return
		}
		changedDirs[parent] = true
		node = parent
	}
}
// Reload updates the filesystem with new configs. It updates existing MKVFile
// objects in place to preserve pointer identity for cached FUSE inodes, and
// merges the directory tree structure (required because go-fuse caches
// persistent inode objects by inode number).
//
// After the merge, FUSE kernel notifications are emitted:
// - NotifyDelete for removed files (sends IN_DELETE to inotify watchers)
// - NotifyEntry for added files (invalidates kernel dentry cache)
// - NotifyContent on changed directories (invalidates readdir cache)
//
// Note: The FUSE protocol has no NOTIFY_CREATE, so added files don't
// generate proactive inotify events. Media servers should use periodic
// scanning in addition to inotify watching.
//
// Semantics:
// - New files become immediately visible
// - Removed files disappear from listings
// - Modified mappings update existing MKVFile objects in place; active readers
// are closed if the underlying dedup path changed (re-opened lazily on next read)
// - Permissions are reloaded from disk and stale entries cleaned up
// (cleanup is skipped if permission reload fails, to avoid overwriting
// a temporarily unreadable permissions file)
func (r *MKVFSRoot) Reload(configs []dedup.Config, logFn func(string, ...interface{})) error {
	// A nil logger is replaced with a no-op so callers may pass nil.
	if logFn == nil {
		logFn = func(string, ...interface{}) {}
	}
	// Build new file set from configs (parallel header reads with soft failure)
	newFiles := make(map[string]*MKVFile)
	type reloadResult struct {
		file *MKVFile
		err  error
	}
	// Each goroutine writes only its own index, so results needs no locking.
	results := make([]reloadResult, len(configs))
	if len(configs) <= 4 {
		// Sequential for small counts
		for i, config := range configs {
			reader, err := r.readerFactory.NewReaderLazy(config.DedupFile, config.SourceDir)
			if err != nil {
				// Soft failure: record the error and keep processing the rest.
				results[i] = reloadResult{err: fmt.Errorf("open dedup file %s: %w", config.DedupFile, err)}
				continue
			}
			results[i] = reloadResult{file: &MKVFile{
				Name:          config.Name,
				DedupPath:     config.DedupFile,
				SourceDir:     config.SourceDir,
				Size:          reader.OriginalSize(),
				readerFactory: r.readerFactory,
			}}
			reader.Close()
		}
	} else {
		// Fixed-size worker pool to bound goroutine count and open file concurrency.
		numWorkers := maxParallelReaders
		if len(configs) < numWorkers {
			numWorkers = len(configs)
		}
		jobs := make(chan int)
		var wg sync.WaitGroup
		wg.Add(numWorkers)
		for range numWorkers {
			go func() {
				defer wg.Done()
				for idx := range jobs {
					cfg := configs[idx]
					reader, err := r.readerFactory.NewReaderLazy(cfg.DedupFile, cfg.SourceDir)
					if err != nil {
						results[idx] = reloadResult{err: fmt.Errorf("open dedup file %s: %w", cfg.DedupFile, err)}
						continue
					}
					results[idx] = reloadResult{file: &MKVFile{
						Name:          cfg.Name,
						DedupPath:     cfg.DedupFile,
						SourceDir:     cfg.SourceDir,
						Size:          reader.OriginalSize(),
						readerFactory: r.readerFactory,
					}}
					reader.Close()
				}
			}()
		}
		for i := range configs {
			jobs <- i
		}
		close(jobs)
		wg.Wait()
	}
	// Collect results in config order; on duplicate names the later entry wins
	// (a warning is logged so the collision is visible).
	for i, res := range results {
		if res.err != nil {
			logFn("warning: skipping %s: %v", configs[i].Name, res.err)
			continue
		}
		if existing, ok := newFiles[res.file.Name]; ok {
			logFn("warning: duplicate name %q (dedup: %s replaced by %s)", res.file.Name, existing.DedupPath, res.file.DedupPath)
		}
		newFiles[res.file.Name] = res.file
	}
	// Snapshot old file names for change detection
	r.mu.RLock()
	oldFileNames := make(map[string]bool, len(r.files))
	for name := range r.files {
		oldFileNames[name] = true
	}
	r.mu.RUnlock()
	// Before merge: capture child inodes for files being removed. We need
	// these for NotifyDelete (sends IN_DELETE inotify event), and the child
	// inode won't be reachable after tree merge removes it. We do NOT
	// capture parent inodes here because the merge may delete parent
	// directories, leaving stale inode pointers that crash go-fuse.
	deletedChildren := make(map[string]*fs.Inode) // filePath → child inode
	for name := range oldFileNames {
		if _, inNew := newFiles[name]; !inNew {
			parentInode, basename := r.findParentInode(name)
			if parentInode != nil {
				if child := parentInode.GetChild(basename); child != nil {
					deletedChildren[name] = child
				}
			}
		}
	}
	// Build new directory tree
	fileList := make([]*MKVFile, 0, len(newFiles))
	for _, f := range newFiles {
		fileList = append(fileList, f)
	}
	newTree := BuildDirectoryTree(fileList, r.verbose, r.readerFactory, r.permStore)
	// Update flat files map in place (preserves pointer identity for cached inodes)
	r.mu.Lock()
	for name := range r.files {
		if _, inNew := newFiles[name]; !inNew {
			delete(r.files, name)
		}
	}
	for name, newFile := range newFiles {
		if existingFile, ok := r.files[name]; ok {
			// Existing file: update in place under its own lock so cached
			// inodes keep pointing at the same MKVFile object.
			existingFile.mu.Lock()
			existingFile.updateFrom(newFile)
			existingFile.mu.Unlock()
		} else {
			r.files[name] = newFile
		}
	}
	r.mu.Unlock()
	// Merge new tree into existing tree in place
	mergeDirectoryTree(r.rootDir, newTree)
	// After merge: capture all notifications using the post-merge tree.
	// Parent inodes are now resolved against the live tree, so we never
	// reference deleted directory inodes. If a parent directory was removed
	// by the merge, findParentInode returns nil and we skip the notification
	// — the directory removal already invalidates its children in the kernel.
	var notifications []reloadNotification
	changedDirs := make(map[*fs.Inode]bool)
	for name := range oldFileNames {
		if _, inNew := newFiles[name]; !inNew {
			parentInode, basename := r.findParentInode(name)
			if parentInode != nil {
				notifications = append(notifications, reloadNotification{
					parent:   parentInode,
					child:    deletedChildren[name],
					name:     basename,
					isDelete: true,
				})
				changedDirs[parentInode] = true
				markAncestorDirs(parentInode, changedDirs)
			}
		}
	}
	for name := range newFiles {
		if !oldFileNames[name] {
			parentInode, basename := r.findParentInode(name)
			if parentInode != nil {
				notifications = append(notifications, reloadNotification{
					parent:   parentInode,
					name:     basename,
					isDelete: false,
				})
				changedDirs[parentInode] = true
				markAncestorDirs(parentInode, changedDirs)
			}
		}
	}
	// Reload permissions and clean up stale entries. Cleanup is skipped when
	// Load fails so a temporarily unreadable permissions file is not clobbered.
	if r.permStore != nil {
		if err := r.permStore.Load(); err != nil {
			logFn("warning: failed to reload permissions: %v", err)
		} else {
			validFiles, validDirs := r.collectValidPaths()
			removed := r.permStore.CleanupStale(validFiles, validDirs)
			if removed > 0 {
				logFn("cleaned up %d stale permission entries", removed)
				if err := r.permStore.Save(); err != nil {
					logFn("warning: failed to save permissions after cleanup: %v", err)
				}
			}
		}
	}
	logFn("reload complete: %d files", len(newFiles))
	// Emit FUSE kernel notifications. Must be called after all filesystem
	// locks are released — go-fuse may call back into the FS during
	// notification processing, which would deadlock if locks were held.
	r.emitReloadNotifications(notifications, changedDirs, logFn)
	return nil
}
// Files returns a snapshot of the current file set. Used by SourceWatcher
// to build reverse mappings from source files to virtual files. The returned
// map is a defensive copy so callers cannot race with concurrent Reload()
// calls mutating the internal map.
func (r *MKVFSRoot) Files() map[string]*MKVFile {
	r.mu.RLock()
	defer r.mu.RUnlock()
	snapshot := make(map[string]*MKVFile, len(r.files))
	for name, file := range r.files {
		snapshot[name] = file
	}
	return snapshot
}
// SetMounted marks the filesystem as mounted, enabling FUSE kernel
// notifications during config reload. Must be called after fs.Mount()
// succeeds.
func (r *MKVFSRoot) SetMounted() {
	// Atomic store; read concurrently by emitReloadNotifications.
	r.mounted.Store(true)
}
// emitReloadNotifications sends FUSE kernel notifications for files that
// were added or removed during a config reload. It is a no-op before the
// filesystem is mounted or when there is nothing to report.
func (r *MKVFSRoot) emitReloadNotifications(notifications []reloadNotification, changedDirs map[*fs.Inode]bool, logFn func(string, ...interface{})) {
	if !r.mounted.Load() || len(notifications) == 0 {
		return
	}
	deleted, invalidated := 0, 0
	for _, note := range notifications {
		switch {
		case note.isDelete && note.child != nil:
			// NotifyDelete sends a real IN_DELETE inotify event.
			if note.parent.NotifyDelete(note.name, note.child) == 0 {
				deleted++
			}
		default:
			// NotifyEntry invalidates the kernel's dentry cache. Used for
			// additions, and for deletions whose child inode was never
			// cached by the kernel.
			if note.parent.NotifyEntry(note.name) == 0 {
				invalidated++
			}
		}
	}
	// Invalidate the readdir cache of every directory that changed. Skip
	// uninitialized inodes (Ino==0) as a safety net — these should not
	// appear here after the findParentInode fix, but guard anyway.
	for dir := range changedDirs {
		if dir.StableAttr().Ino != 0 {
			dir.NotifyContent(0, 0)
		}
	}
	if deleted+invalidated > 0 {
		logFn("kernel notifications: %d deleted, %d invalidated, %d dirs", deleted, invalidated, len(changedDirs))
	}
}
// collectValidPaths returns maps of all valid file and directory paths
// currently present in the mounted tree (empty maps when there is no tree).
func (r *MKVFSRoot) collectValidPaths() (files, dirs map[string]bool) {
	files, dirs = map[string]bool{}, map[string]bool{}
	if root := r.rootDir; root != nil {
		r.collectPathsRecursive(root, files, dirs)
	}
	return files, dirs
}
// collectPathsRecursive walks the directory tree under node, recording every
// directory path (including the root's empty path) in dirs and every file's
// slash-joined path in files.
func (r *MKVFSRoot) collectPathsRecursive(node *MKVFSDirNode, files, dirs map[string]bool) {
	node.mu.RLock()
	defer node.mu.RUnlock()
	dirs[node.path] = true
	// Precompute the path prefix once; the root has an empty path.
	prefix := ""
	if node.path != "" {
		prefix = node.path + "/"
	}
	for name := range node.files {
		files[prefix+name] = true
	}
	for _, child := range node.subdirs {
		r.collectPathsRecursive(child, files, dirs)
	}
}
// Getattr implements fs.NodeGetattrer - returns attributes for the root
// directory, using permissions from the permission store so the root is
// consistent with all subdirectories.
func (r *MKVFSRoot) Getattr(ctx context.Context, fh fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	uid, gid, mode := getDirPerms(r.permStore, "")
	ts := uint64(time.Now().Unix())
	out.Mode = fuse.S_IFDIR | mode
	out.Uid = uid
	out.Gid = gid
	out.Atime, out.Mtime, out.Ctime = ts, ts, ts
	// Link count: 2 for "." and "..", plus one per subdirectory.
	out.Nlink = 2
	if root := r.rootDir; root != nil {
		root.mu.RLock()
		out.Nlink += uint32(len(root.subdirs))
		root.mu.RUnlock()
	}
	return 0
}
// Readdir implements fs.NodeReaddirer - lists files in the root directory,
// delegating to the directory tree for hierarchical listing.
func (r *MKVFSRoot) Readdir(ctx context.Context) (fs.DirStream, syscall.Errno) {
	// Permission checks are handled by the kernel via default_permissions mount option.
	// This properly checks supplementary groups and matches real filesystem behavior.
	if r.rootDir != nil {
		return r.rootDir.readdirInternal(ctx)
	}
	// Flat fallback when no directory tree exists (shouldn't happen).
	r.mu.RLock()
	defer r.mu.RUnlock()
	if r.verbose {
		log.Printf("Readdir: listing %d files (flat)", len(r.files))
	}
	entries := make([]fuse.DirEntry, 0, len(r.files))
	for name := range r.files {
		if r.verbose {
			log.Printf("Readdir: adding %s", name)
		}
		entries = append(entries, fuse.DirEntry{Name: name, Mode: fuse.S_IFREG})
	}
	return fs.NewListDirStream(entries), 0
}
// Lookup implements fs.NodeLookuper - looks up a file or directory by name.
// Uses the directory tree for hierarchical lookup.
func (r *MKVFSRoot) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
	// Permission checks are handled by the kernel via default_permissions mount option.
	if r.rootDir != nil {
		// The root read lock is held (via defer) for the whole tree-backed
		// branch, including inode construction below.
		r.rootDir.mu.RLock()
		defer r.rootDir.mu.RUnlock()
		// Check subdirectories first
		if subdir, ok := r.rootDir.subdirs[name]; ok {
			if r.verbose {
				log.Printf("Lookup: found subdir %s at root", name)
			}
			// Lock subdir to safely access its fields
			subdir.mu.RLock()
			subdirCount := len(subdir.subdirs)
			subdir.mu.RUnlock()
			uid, gid, mode := getDirPerms(r.permStore, subdir.path)
			now := time.Now()
			out.Mode = fuse.S_IFDIR | mode
			out.Uid = uid
			out.Gid = gid
			out.Atime = uint64(now.Unix())
			out.Mtime = uint64(now.Unix())
			out.Ctime = uint64(now.Unix())
			// 2 for "." and "..", plus one per subdirectory.
			out.Nlink = 2 + uint32(subdirCount)
			// Stable inode number derived from the directory path so the
			// same directory always gets the same Ino across lookups.
			stable := fs.StableAttr{
				Mode: fuse.S_IFDIR,
				Ino:  hashString(subdir.path),
			}
			// Directories use a persistent inode (cached by go-fuse).
			child := r.NewPersistentInode(ctx, subdir, stable)
			return child, 0
		}
		// Check files
		if file, ok := r.rootDir.files[name]; ok {
			if r.verbose {
				log.Printf("Lookup: found file %s at root (size=%d)", name, file.Size)
			}
			uid, gid, mode := getFilePerms(r.permStore, name)
			now := time.Now()
			out.Size = uint64(file.Size)
			out.Mode = fuse.S_IFREG | mode
			out.Uid = uid
			out.Gid = gid
			out.Atime = uint64(now.Unix())
			out.Mtime = uint64(now.Unix())
			out.Ctime = uint64(now.Unix())
			out.Nlink = 1
			node := &MKVFSNode{file: file, path: name, verbose: r.verbose, permStore: r.permStore}
			stable := fs.StableAttr{
				Mode: fuse.S_IFREG,
				Ino:  hashString(name),
			}
			// Files use a regular (non-persistent) inode.
			child := r.NewInode(ctx, node, stable)
			return child, 0
		}
		if r.verbose {
			log.Printf("Lookup: not found %s at root", name)
		}
		return nil, syscall.ENOENT
	}
	// Fallback to flat lookup if no directory tree (shouldn't happen)
	r.mu.RLock()
	file, ok := r.files[name]
	r.mu.RUnlock()
	if !ok {
		if r.verbose {
			log.Printf("Lookup: file not found: %s", name)
		}
		return nil, syscall.ENOENT
	}
	if r.verbose {
		log.Printf("Lookup: %s (size=%d)", name, file.Size)
	}
	uid, gid, mode := getFilePerms(r.permStore, name)
	// Create a new file node
	node := &MKVFSNode{file: file, path: name, verbose: r.verbose, permStore: r.permStore}
	// Set attributes
	now := time.Now()
	out.Size = uint64(file.Size)
	out.Mode = fuse.S_IFREG | mode
	out.Uid = uid
	out.Gid = gid
	out.Atime = uint64(now.Unix())
	out.Mtime = uint64(now.Unix())
	out.Ctime = uint64(now.Unix())
	// Create inode with stable ID based on filename
	stable := fs.StableAttr{
		Mode: fuse.S_IFREG,
		Ino:  hashString(name),
	}
	child := r.NewInode(ctx, node, stable)
	return child, 0
}
package fuse
import (
"context"
"fmt"
"os/exec"
"strings"
"sync"
"time"
"al.essio.dev/pkg/shellescape"
"github.com/stuckj/mkvdup/internal/dedup"
)
// ErrorEvent describes a source integrity issue detected by the watcher.
type ErrorEvent struct {
	SourcePath    string   // absolute path of the changed source file
	AffectedFiles []string // virtual file names affected
	// Event is one of: "changed", "missing", "size_changed",
	// "checksum_mismatch", "read_error", "checksum_queue_full".
	Event string
}
// ErrorNotifier batches integrity error events and executes an external
// command with placeholder substitution. Events are collected for a
// configurable batch interval; when the interval expires, the command
// is executed once with all accumulated events.
type ErrorNotifier struct {
	config dedup.ErrorCommandConfig
	logFn  func(string, ...interface{})
	mu      sync.Mutex   // guards pending, timer, and stopped
	pending []ErrorEvent // events accumulated since the last flush
	timer   *time.Timer  // debounce timer; nil when no batch is pending
	stopped bool         // set by Stop; suppresses further notifications
}
// NewErrorNotifier creates a notifier from the given config. A nil logFn is
// replaced with a no-op logger so callers may pass nil.
func NewErrorNotifier(config dedup.ErrorCommandConfig, logFn func(string, ...interface{})) *ErrorNotifier {
	n := &ErrorNotifier{config: config, logFn: logFn}
	if n.logFn == nil {
		n.logFn = func(string, ...interface{}) {}
	}
	return n
}
// Notify adds an error event to the batch. If this is the first event in
// the batch, a timer is started. Subsequent events reset the timer so that
// rapid bursts are coalesced into a single command execution.
func (n *ErrorNotifier) Notify(event ErrorEvent) {
	n.mu.Lock()
	defer n.mu.Unlock()
	if n.stopped {
		return
	}
	n.pending = append(n.pending, event)
	// Debounce: reset a live timer, otherwise start one.
	if n.timer != nil {
		n.timer.Reset(n.config.BatchInterval)
		return
	}
	n.timer = time.AfterFunc(n.config.BatchInterval, n.flush)
}
// Stop flushes any pending events and prevents future notifications.
func (n *ErrorNotifier) Stop() {
	n.mu.Lock()
	n.stopped = true
	if t := n.timer; t != nil {
		t.Stop()
		n.timer = nil
	}
	batch := n.pending
	n.pending = nil
	n.mu.Unlock()
	// Execute outside the lock so a slow command cannot block Notify callers.
	if len(batch) == 0 {
		return
	}
	n.executeCommand(batch)
}
// flush is called when the debounce timer fires; it drains the pending batch
// and runs the configured command once for all accumulated events.
func (n *ErrorNotifier) flush() {
	n.mu.Lock()
	if n.stopped {
		n.mu.Unlock()
		return
	}
	batch := n.pending
	n.pending, n.timer = nil, nil
	n.mu.Unlock()
	if len(batch) == 0 {
		return
	}
	n.executeCommand(batch)
}
// executeCommand runs the configured external command with placeholders
// substituted from the batched events. The command runs under a timeout
// and its combined output is logged on failure.
func (n *ErrorNotifier) executeCommand(events []ErrorEvent) {
	argv := n.config.Command.Args
	if len(argv) == 0 {
		n.logFn("source-watch: on_error_command: no command configured, skipping")
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), n.config.Timeout)
	defer cancel()
	var cmd *exec.Cmd
	if n.config.Command.IsShell {
		// String form: run via sh -c with shell-escaped placeholder values.
		cmd = exec.CommandContext(ctx, "sh", "-c", substitutePlaceholders(argv[0], events, true))
	} else {
		// List form: substitute placeholders per argument (no escaping needed).
		expanded := make([]string, len(argv))
		for i, arg := range argv {
			expanded[i] = substitutePlaceholders(arg, events, false)
		}
		cmd = exec.CommandContext(ctx, expanded[0], expanded[1:]...)
	}
	output, err := cmd.CombinedOutput()
	if err != nil {
		n.logFn("source-watch: on_error_command failed: %v (output: %s)", err, strings.TrimSpace(string(output)))
	}
}
// substitutePlaceholders replaces %source%, %files%, and %event% in s
// with values derived from the batched events. When shellEscape is true,
// placeholder values are shell-escaped for safe use in sh -c commands.
func substitutePlaceholders(s string, events []ErrorEvent, shellEscape bool) string {
	// %source%: newline-separated source paths, deduplicated, first-seen order.
	seenSources := map[string]bool{}
	var sources []string
	for _, ev := range events {
		if seenSources[ev.SourcePath] {
			continue
		}
		seenSources[ev.SourcePath] = true
		sources = append(sources, ev.SourcePath)
	}
	// %files%: comma-separated affected virtual files, deduplicated.
	seenFiles := map[string]bool{}
	var files []string
	for _, ev := range events {
		for _, name := range ev.AffectedFiles {
			if seenFiles[name] {
				continue
			}
			seenFiles[name] = true
			files = append(files, name)
		}
	}
	// %event%: a bare event name for a single event, otherwise one
	// "path: event" line per event.
	var eventLines []string
	if len(events) == 1 {
		eventLines = []string{events[0].Event}
	} else {
		for _, ev := range events {
			eventLines = append(eventLines, fmt.Sprintf("%s: %s", ev.SourcePath, ev.Event))
		}
	}
	sourceVal := strings.Join(sources, "\n")
	filesVal := strings.Join(files, ", ")
	eventVal := strings.Join(eventLines, "\n")
	if shellEscape {
		sourceVal = shellescape.Quote(sourceVal)
		filesVal = shellescape.Quote(filesVal)
		eventVal = shellescape.Quote(eventVal)
	}
	// Replacements are applied sequentially; each pass rescans the result.
	s = strings.ReplaceAll(s, "%source%", sourceVal)
	s = strings.ReplaceAll(s, "%files%", filesVal)
	s = strings.ReplaceAll(s, "%event%", eventVal)
	return s
}
// Package fuse provides a FUSE filesystem for accessing deduplicated MKV files.
package fuse
import (
"context"
"fmt"
"log"
"os"
"os/user"
"path/filepath"
"strconv"
"sync"
"syscall"
"github.com/hanwen/go-fuse/v2/fuse"
"gopkg.in/yaml.v3"
)
// Perms holds uid, gid, and mode for a file or directory.
// Nil values indicate the field should inherit from defaults.
type Perms struct {
	UID  *uint32 `yaml:"uid,omitempty"`  // owner uid override, nil = default
	GID  *uint32 `yaml:"gid,omitempty"`  // group gid override, nil = default
	Mode *uint32 `yaml:"mode,omitempty"` // mode bits override, nil = default
}
// Defaults holds default permissions for files and directories.
// When loaded from the permissions file, zero-valued fields are treated
// as "unset" and keep the built-in defaults (see Load).
type Defaults struct {
	FileUID  uint32 `yaml:"file_uid"`
	FileGID  uint32 `yaml:"file_gid"`
	FileMode uint32 `yaml:"file_mode"`
	DirUID   uint32 `yaml:"dir_uid"`
	DirGID   uint32 `yaml:"dir_gid"`
	DirMode  uint32 `yaml:"dir_mode"`
}
// DefaultPerms returns the default permission values: root-owned,
// world-readable files (0444) and traversable directories (0555).
func DefaultPerms() Defaults {
	var d Defaults
	// UID/GID fields stay 0 (root) via the zero value.
	d.FileMode = 0444
	d.DirMode = 0555
	return d
}
// permissionsFile is the structure of the permissions YAML file
// (the on-disk schema read by Load and written by Save).
type permissionsFile struct {
	Defaults    Defaults          `yaml:"defaults"`
	Files       map[string]*Perms `yaml:"files,omitempty"`       // keyed by virtual file path
	Directories map[string]*Perms `yaml:"directories,omitempty"` // keyed by virtual directory path
}
// PermissionStore manages file/directory permissions with persistence.
type PermissionStore struct {
	path     string // YAML file path; "" disables persistence (Load/Save become no-ops)
	defaults Defaults
	files    map[string]*Perms // per-file overrides, keyed by virtual path
	dirs     map[string]*Perms // per-directory overrides, keyed by virtual path
	mu       sync.RWMutex      // guards defaults, files, and dirs
	verbose  bool
}
// NewPermissionStore creates a new permission store.
// If path is empty, permissions will not be persisted.
func NewPermissionStore(path string, defaults Defaults, verbose bool) *PermissionStore {
	store := &PermissionStore{
		path:     path,
		defaults: defaults,
		verbose:  verbose,
	}
	store.files = map[string]*Perms{}
	store.dirs = map[string]*Perms{}
	return store
}
// Load loads permissions from the file.
// If the file doesn't exist, the store remains empty (using defaults).
//
// Zero-valued fields in the file's defaults section are treated as "unset"
// and keep the built-in defaults — an explicit uid/gid 0 or mode 0 therefore
// cannot be expressed in the file.
func (s *PermissionStore) Load() error {
	if s.path == "" {
		// Persistence disabled; nothing to load.
		return nil
	}
	data, err := os.ReadFile(s.path)
	if err != nil {
		if os.IsNotExist(err) {
			if s.verbose {
				log.Printf("Permissions file %s does not exist, using defaults", s.path)
			}
			return nil
		}
		return fmt.Errorf("read permissions file: %w", err)
	}
	var pf permissionsFile
	if err := yaml.Unmarshal(data, &pf); err != nil {
		return fmt.Errorf("parse permissions file: %w", err)
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	// Override defaults field-by-field; a zero value means "not set in the
	// file". (No outer "any field set" guard is needed — each per-field
	// check already covers it.)
	if pf.Defaults.FileMode != 0 {
		s.defaults.FileMode = pf.Defaults.FileMode
	}
	if pf.Defaults.FileUID != 0 {
		s.defaults.FileUID = pf.Defaults.FileUID
	}
	if pf.Defaults.FileGID != 0 {
		s.defaults.FileGID = pf.Defaults.FileGID
	}
	if pf.Defaults.DirMode != 0 {
		s.defaults.DirMode = pf.Defaults.DirMode
	}
	if pf.Defaults.DirUID != 0 {
		s.defaults.DirUID = pf.Defaults.DirUID
	}
	if pf.Defaults.DirGID != 0 {
		s.defaults.DirGID = pf.Defaults.DirGID
	}
	// Replace override maps only when present in the file, so a file with
	// no entries doesn't wipe in-memory state.
	if pf.Files != nil {
		s.files = pf.Files
	}
	if pf.Directories != nil {
		s.dirs = pf.Directories
	}
	if s.verbose {
		log.Printf("Loaded permissions: %d files, %d directories", len(s.files), len(s.dirs))
	}
	return nil
}
// Save saves permissions to the file.
//
// The file is written atomically (temp file + rename in the same directory)
// so a crash mid-write cannot leave a truncated or corrupt permissions file.
func (s *PermissionStore) Save() error {
	if s.path == "" {
		// Persistence disabled; nothing to save.
		return nil
	}
	s.mu.RLock()
	// Deep copy the maps to avoid data races during marshalling.
	// We copy both the map and the Perms values to ensure complete isolation.
	pf := permissionsFile{
		Defaults: s.defaults,
	}
	if s.files != nil {
		pf.Files = make(map[string]*Perms, len(s.files))
		for k, v := range s.files {
			if v != nil {
				permsCopy := *v // copy the Perms struct
				pf.Files[k] = &permsCopy
			}
		}
	}
	if s.dirs != nil {
		pf.Directories = make(map[string]*Perms, len(s.dirs))
		for k, v := range s.dirs {
			if v != nil {
				permsCopy := *v // copy the Perms struct
				pf.Directories[k] = &permsCopy
			}
		}
	}
	s.mu.RUnlock()
	// Create parent directory if needed
	dir := filepath.Dir(s.path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("create permissions directory: %w", err)
	}
	data, err := yaml.Marshal(&pf)
	if err != nil {
		return fmt.Errorf("marshal permissions: %w", err)
	}
	// Atomic write: stage into a temp file in the same directory, then
	// rename over the target. Rename within one filesystem is atomic, so
	// readers always see either the old or the new complete file.
	tmp, err := os.CreateTemp(dir, ".permissions-*.yaml")
	if err != nil {
		return fmt.Errorf("create temp permissions file: %w", err)
	}
	tmpName := tmp.Name()
	if _, err := tmp.Write(data); err != nil {
		tmp.Close()
		os.Remove(tmpName)
		return fmt.Errorf("write permissions file: %w", err)
	}
	if err := tmp.Close(); err != nil {
		os.Remove(tmpName)
		return fmt.Errorf("write permissions file: %w", err)
	}
	// CreateTemp creates 0600 files; restore the 0644 mode used previously.
	if err := os.Chmod(tmpName, 0644); err != nil {
		os.Remove(tmpName)
		return fmt.Errorf("write permissions file: %w", err)
	}
	if err := os.Rename(tmpName, s.path); err != nil {
		os.Remove(tmpName)
		return fmt.Errorf("write permissions file: %w", err)
	}
	if s.verbose {
		log.Printf("Saved permissions to %s", s.path)
	}
	return nil
}
// GetFilePerms returns the effective permissions for a file.
// Returns uid, gid, mode with defaults applied for any unset values.
func (s *PermissionStore) GetFilePerms(path string) (uid, gid, mode uint32) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	// Start from the defaults, then layer any per-file overrides on top.
	uid, gid, mode = s.defaults.FileUID, s.defaults.FileGID, s.defaults.FileMode
	override, ok := s.files[path]
	if !ok {
		return uid, gid, mode
	}
	if override.UID != nil {
		uid = *override.UID
	}
	if override.GID != nil {
		gid = *override.GID
	}
	if override.Mode != nil {
		mode = *override.Mode
	}
	return uid, gid, mode
}
// GetDirPerms returns the effective permissions for a directory.
// Returns uid, gid, mode with defaults applied for any unset values.
func (s *PermissionStore) GetDirPerms(path string) (uid, gid, mode uint32) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	// Start from the defaults, then layer any per-directory overrides on top.
	uid, gid, mode = s.defaults.DirUID, s.defaults.DirGID, s.defaults.DirMode
	override, ok := s.dirs[path]
	if !ok {
		return uid, gid, mode
	}
	if override.UID != nil {
		uid = *override.UID
	}
	if override.GID != nil {
		gid = *override.GID
	}
	if override.Mode != nil {
		mode = *override.Mode
	}
	return uid, gid, mode
}
// SetFilePerms sets permissions for a file.
// Only non-nil values are updated; nil values leave existing values unchanged.
// Automatically saves to disk.
func (s *PermissionStore) SetFilePerms(path string, uid, gid *uint32, mode *uint32) error {
	s.mu.Lock()
	if uid == nil && gid == nil && mode == nil {
		// Nothing requested; avoid a pointless save.
		s.mu.Unlock()
		return nil
	}
	entry, ok := s.files[path]
	if !ok {
		entry = &Perms{}
		s.files[path] = entry
	}
	// Copy each requested value so the store owns its lifetime.
	assign := func(src *uint32, dst **uint32) {
		if src != nil {
			v := *src
			*dst = &v
		}
	}
	assign(uid, &entry.UID)
	assign(gid, &entry.GID)
	assign(mode, &entry.Mode)
	s.mu.Unlock()
	if s.verbose {
		log.Printf("SetFilePerms: %s uid=%v gid=%v mode=%v", path, uid, gid, mode)
	}
	// Save takes its own locks, so it must run after the unlock above.
	return s.Save()
}
// RemoveFilePerms removes all permission overrides for a file.
// The file will use default permissions. Automatically saves to disk.
func (s *PermissionStore) RemoveFilePerms(path string) error {
	s.mu.Lock()
	delete(s.files, path)
	s.mu.Unlock()
	if s.verbose {
		log.Printf("RemoveFilePerms: %s", path)
	}
	// Save takes its own locks, so it must run after the unlock above.
	return s.Save()
}
// SetDirPerms sets permissions for a directory.
// Only non-nil values are updated; nil values leave existing values unchanged.
// Automatically saves to disk.
func (s *PermissionStore) SetDirPerms(path string, uid, gid *uint32, mode *uint32) error {
	s.mu.Lock()
	if uid == nil && gid == nil && mode == nil {
		// Nothing requested; avoid a pointless save.
		s.mu.Unlock()
		return nil
	}
	entry, ok := s.dirs[path]
	if !ok {
		entry = &Perms{}
		s.dirs[path] = entry
	}
	// Copy each requested value so the store owns its lifetime.
	assign := func(src *uint32, dst **uint32) {
		if src != nil {
			v := *src
			*dst = &v
		}
	}
	assign(uid, &entry.UID)
	assign(gid, &entry.GID)
	assign(mode, &entry.Mode)
	s.mu.Unlock()
	if s.verbose {
		log.Printf("SetDirPerms: %s uid=%v gid=%v mode=%v", path, uid, gid, mode)
	}
	// Save takes its own locks, so it must run after the unlock above.
	return s.Save()
}
// RemoveDirPerms removes all permission overrides for a directory.
// The directory will use default permissions. Automatically saves to disk.
func (s *PermissionStore) RemoveDirPerms(path string) error {
	s.mu.Lock()
	delete(s.dirs, path)
	s.mu.Unlock()
	if s.verbose {
		log.Printf("RemoveDirPerms: %s", path)
	}
	// Save takes its own locks, so it must run after the unlock above.
	return s.Save()
}
// CleanupStale removes entries for paths that don't exist in the mounted
// filesystem. validFiles and validDirs are sets of valid paths (values are
// ignored; only key membership matters). Returns the number of stale
// entries removed. Deleting while ranging is safe for Go maps.
func (s *PermissionStore) CleanupStale(validFiles, validDirs map[string]bool) int {
	s.mu.Lock()
	defer s.mu.Unlock()
	removed := 0
	for path := range s.files {
		if validFiles[path] {
			continue
		}
		delete(s.files, path)
		removed++
		if s.verbose {
			log.Printf("Removed stale file permission entry: %s", path)
		}
	}
	for path := range s.dirs {
		if validDirs[path] {
			continue
		}
		delete(s.dirs, path)
		removed++
		if s.verbose {
			log.Printf("Removed stale directory permission entry: %s", path)
		}
	}
	return removed
}
// Defaults returns a copy of the current default permissions, read under lock.
func (s *PermissionStore) Defaults() Defaults {
	s.mu.RLock()
	d := s.defaults
	s.mu.RUnlock()
	return d
}
// ResolvePermissionsPath determines which permissions file to use.
// Priority:
//  1. explicitPath (from --permissions-file flag)
//  2. ~/.config/mkvdup/permissions.yaml (if exists) - for both root and non-root
//  3. Default based on euid: root uses /etc/, non-root uses ~/.config/
//
// Non-root users always get a user-writable path (unless explicitly overridden)
// to avoid EACCES errors when saving permission changes.
func ResolvePermissionsPath(explicitPath string) string {
	if explicitPath != "" {
		return explicitPath
	}
	userPath := ""
	if home, err := os.UserHomeDir(); err == nil {
		userPath = filepath.Join(home, ".config", "mkvdup", "permissions.yaml")
	}
	// An existing user config takes priority for both root and non-root.
	if userPath != "" {
		if _, err := os.Stat(userPath); err == nil {
			return userPath
		}
	}
	const systemPath = "/etc/mkvdup/permissions.yaml"
	// Root defaults to the system path whether or not it exists yet.
	// (No stat needed — the result is the same either way.)
	if os.Geteuid() == 0 {
		return systemPath
	}
	// Non-root: always use the user path to ensure writability. Do NOT use
	// the system path even if it exists, as non-root users typically cannot
	// write to /etc/ and chmod/chown operations would fail with EACCES.
	if userPath != "" {
		return userPath
	}
	// Fallback if no home directory (unusual for non-root).
	return systemPath
}
// CallerInfo represents the calling process's credentials.
type CallerInfo struct {
	Uid uint32 // caller's user id (from the FUSE request context)
	Gid uint32 // caller's primary group id (from the FUSE request context)
}
// testCallerHook is set by test code to allow injecting caller credentials.
// This is nil in production, ensuring only real FUSE contexts are trusted.
// GetCaller consults it only after fuse.FromContext has failed.
var testCallerHook func(context.Context) (CallerInfo, bool)
// GetCaller extracts caller credentials from the FUSE context.
// Returns (caller, true) if credentials are available, (zero, false)
// otherwise. Callers should deny access when ok is false to fail closed.
func GetCaller(ctx context.Context) (CallerInfo, bool) {
	if c, ok := fuse.FromContext(ctx); ok {
		return CallerInfo{Uid: c.Uid, Gid: c.Gid}, true
	}
	// Fall back to the test-injected hook (nil outside tests).
	if hook := testCallerHook; hook != nil {
		if c, ok := hook(ctx); ok {
			return c, true
		}
	}
	// Fail closed: no credentials available.
	var none CallerInfo
	return none, false
}
// IsRoot returns true if the caller is root (uid 0).
func (c CallerInfo) IsRoot() bool {
	return c.Uid == 0
}
// CheckChown verifies the caller can change file ownership.
// Returns 0 if allowed, syscall.EPERM if denied.
//
// Rules:
//   - Only root can change the UID to a different user.
//   - No-op changes (nil, or equal to the current value) are always allowed.
//   - Root can change the GID to anything; a non-root caller must be the
//     file's owner AND a member (primary or supplementary) of the target group.
func CheckChown(caller CallerInfo, fileUID, fileGID uint32, newUID, newGID *uint32) syscall.Errno {
	uidChanging := newUID != nil && *newUID != fileUID
	if uidChanging && !caller.IsRoot() {
		return syscall.EPERM
	}
	gidChanging := newGID != nil && *newGID != fileGID
	if !gidChanging || caller.IsRoot() {
		return 0
	}
	// Non-root GID change: must be the owner and belong to the target group.
	if caller.Uid != fileUID || !isGroupMember(caller.Uid, caller.Gid, *newGID) {
		return syscall.EPERM
	}
	return 0
}
// groupMembershipFunc is the function used to check group membership.
// It can be overridden in tests to avoid OS-level lookups; the default
// implementation is defaultGroupMembership.
var groupMembershipFunc = defaultGroupMembership
// isGroupMember checks if a user is a member of the given group.
// This checks the primary GID and supplementary groups. It is a thin
// indirection through groupMembershipFunc so tests can stub it out.
func isGroupMember(uid, primaryGID, targetGID uint32) bool {
	return groupMembershipFunc(uid, primaryGID, targetGID)
}
// defaultGroupMembership checks group membership by looking up the user's
// groups from the OS. The primary GID short-circuits without an OS lookup;
// any lookup failure is treated as "not a member".
func defaultGroupMembership(uid, primaryGID, targetGID uint32) bool {
	if targetGID == primaryGID {
		// Primary GID is always a member.
		return true
	}
	u, err := user.LookupId(strconv.FormatUint(uint64(uid), 10))
	if err != nil {
		return false
	}
	groupIDs, err := u.GroupIds()
	if err != nil {
		return false
	}
	want := strconv.FormatUint(uint64(targetGID), 10)
	for _, gid := range groupIDs {
		if gid == want {
			return true
		}
	}
	return false
}
// CheckChmod verifies the caller can change file mode.
// Returns 0 if allowed, syscall.EPERM if denied.
// Only root or the file owner can chmod.
func CheckChmod(caller CallerInfo, fileUID uint32) syscall.Errno {
	if !caller.IsRoot() && caller.Uid != fileUID {
		return syscall.EPERM
	}
	return 0
}
package fuse
import (
"log"
"path"
"strings"
)
// BuildDirectoryTree creates a directory tree from files with path-containing names.
// Directories are auto-created for each path component.
// Files with names like "Movies/Action/film.mkv" will create the directory hierarchy.
//
// Path handling:
// - Leading slashes are stripped (absolute paths become relative)
// - Paths are cleaned (e.g., "foo//bar" becomes "foo/bar")
// - Only forward slashes (/) are treated as path separators
// - Paths containing ".." components are rejected
// - Empty filenames are rejected
//
// Conflicts:
// - Duplicate paths: later file wins, warning logged
// - File/directory collision: directory wins, file skipped with warning
func BuildDirectoryTree(files []*MKVFile, verbose bool, readerFactory ReaderFactory, permStore *PermissionStore) *MKVFSDirNode {
	// The root has empty name/path (zero values); intermediate directory
	// nodes are created on demand by insertFile while walking each path.
	root := &MKVFSDirNode{
		files:         map[string]*MKVFile{},
		subdirs:       map[string]*MKVFSDirNode{},
		verbose:       verbose,
		readerFactory: readerFactory,
		permStore:     permStore,
	}
	for _, f := range files {
		insertFile(root, f, verbose, readerFactory, permStore)
	}
	return root
}
// insertFile inserts a file into the directory tree, creating directories
// as needed.
//
// Security: any ".." path component causes the file to be rejected. The
// check is made on whole components of the raw name (before path.Clean,
// which would silently resolve "a/../b" to "b"), so traversal attempts are
// rejected outright while legitimate filenames that merely contain ".."
// (e.g. "Movie..2020.mkv") are accepted.
func insertFile(root *MKVFSDirNode, file *MKVFile, verbose bool, readerFactory ReaderFactory, permStore *PermissionStore) {
	for _, part := range strings.Split(file.Name, "/") {
		if part == ".." {
			log.Printf("Warning: skipping file with invalid path (contains '..'): %s", file.Name)
			return
		}
	}
	// Clean and split the path
	cleanPath := path.Clean(file.Name)
	parts := strings.Split(cleanPath, "/")
	// Filter out empty parts and "." (handles leading slashes, repeated
	// slashes, and relative prefixes).
	validParts := make([]string, 0, len(parts))
	for _, p := range parts {
		if p != "" && p != "." {
			validParts = append(validParts, p)
		}
	}
	// Validate: reject empty filenames
	if len(validParts) == 0 {
		log.Printf("Warning: skipping file with empty name: %q", file.Name)
		return
	}
	fileName := validParts[len(validParts)-1]
	if fileName == "" {
		log.Printf("Warning: skipping file with empty filename: %q", file.Name)
		return
	}
	// Navigate/create directories for each path component except the last (filename)
	current := root
	for i := 0; i < len(validParts)-1; i++ {
		dirName := validParts[i]
		current.mu.Lock()
		// File/directory collision: an existing file wins over a new directory.
		if _, fileExists := current.files[dirName]; fileExists {
			log.Printf("Warning: path component %q conflicts with existing file, skipping: %s", dirName, file.Name)
			current.mu.Unlock()
			return
		}
		subdir, exists := current.subdirs[dirName]
		if !exists {
			// Create new directory node
			var newPath string
			if current.path == "" {
				newPath = dirName
			} else {
				newPath = current.path + "/" + dirName
			}
			subdir = &MKVFSDirNode{
				name:          dirName,
				path:          newPath,
				files:         make(map[string]*MKVFile),
				subdirs:       make(map[string]*MKVFSDirNode),
				verbose:       verbose,
				readerFactory: readerFactory,
				permStore:     permStore,
			}
			current.subdirs[dirName] = subdir
		}
		current.mu.Unlock()
		current = subdir
	}
	// Insert the file into the final directory
	current.mu.Lock()
	defer current.mu.Unlock()
	// File/directory collision: an existing directory wins; the file is skipped.
	if _, dirExists := current.subdirs[fileName]; dirExists {
		log.Printf("Warning: file %q conflicts with existing directory, skipping", file.Name)
		return
	}
	// Duplicate path: later file wins, with a warning.
	if existing, exists := current.files[fileName]; exists {
		log.Printf("Warning: duplicate path %q, replacing %s with %s", file.Name, existing.DedupPath, file.DedupPath)
	}
	current.files[fileName] = file
}
// mergeDirectoryTree merges newTree's contents into existing by mutating
// existing's files and subdirs maps in place. go-fuse caches persistent inode
// objects by inode number, so swapping in a new root node would be invisible
// to already-cached inodes — updating the maps of the existing nodes is not.
func mergeDirectoryTree(existing, newTree *MKVFSDirNode) {
	existing.mu.Lock()
	defer existing.mu.Unlock()
	// Drop files that vanished from the new tree.
	for name := range existing.files {
		if _, stillPresent := newTree.files[name]; !stillPresent {
			delete(existing.files, name)
		}
	}
	// Add new files; refresh existing ones in place so cached inodes keep
	// pointing at the same *MKVFile.
	for name, incoming := range newTree.files {
		current, ok := existing.files[name]
		if !ok {
			existing.files[name] = incoming
			continue
		}
		current.mu.Lock()
		current.updateFrom(incoming)
		current.mu.Unlock()
	}
	// Drop subdirectories that vanished from the new tree.
	for name := range existing.subdirs {
		if _, stillPresent := newTree.subdirs[name]; !stillPresent {
			delete(existing.subdirs, name)
		}
	}
	// Adopt brand-new subdirectories; recursively merge pre-existing ones.
	for name, incoming := range newTree.subdirs {
		if current, ok := existing.subdirs[name]; ok {
			mergeDirectoryTree(current, incoming)
		} else {
			existing.subdirs[name] = incoming
		}
	}
}
package fuse
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/cespare/xxhash/v2"
"github.com/fsnotify/fsnotify"
"github.com/stuckj/mkvdup/internal/dedup"
)
// Default poll interval for network filesystems where inotify doesn't work.
const defaultPollInterval = 60 * time.Second

// checksumRequest is a queued checksum verification job, processed
// sequentially by checksumWorker.
type checksumRequest struct {
	absPath          string     // absolute path of the source file to re-hash
	expectedChecksum uint64     // xxhash value recorded for this source file
	expectedSize     int64      // file size recorded for this source file
	affected         []*MKVFile // virtual files backed by this source file
	gen              uint64     // generation stamp; stale requests are skipped
}
// SourceWatcher monitors source files for changes and takes action when
// modifications are detected. It uses inotify for local filesystems and
// falls back to polling for network filesystems (NFS, CIFS/SMB).
type SourceWatcher struct {
	// watcher is the underlying fsnotify watcher (inotify-backed).
	watcher *fsnotify.Watcher
	// reverse maps absolute source file paths to the virtual files that use them.
	reverse map[string][]*MKVFile
	// checksums maps absolute source file paths to expected xxhash values.
	checksums map[string]uint64
	// sizes maps absolute source file paths to expected file sizes.
	sizes map[string]int64
	// pollFiles maps absolute source file paths to their last known mtime
	// for directories that use polling instead of inotify.
	pollFiles map[string]time.Time
	action string // "warn", "disable", "checksum"
	// logFn receives diagnostic messages; never nil after NewSourceWatcher.
	logFn func(string, ...interface{})
	// mu guards the maps above plus checksumPending and updateGen.
	mu sync.RWMutex
	// checksumCh queues checksum verification requests so they run
	// sequentially in a single worker goroutine, avoiding I/O storms
	// when many source files change at once.
	checksumCh chan checksumRequest
	// checksumPending tracks source paths with a queued checksum request,
	// preventing duplicate queue entries for the same file. The worker
	// clears the flag when it starts processing, so new events that arrive
	// during verification are still queued.
	checksumPending map[string]bool
	// updateGen is incremented on each Update() call. Checksum requests
	// carry the generation they were created in; the worker skips requests
	// whose generation doesn't match, preventing stale verifications from
	// a previous config from disabling files after a reload.
	updateGen uint64
	pollInterval time.Duration // interval for network FS polling (0 = defaultPollInterval)
	notifier *ErrorNotifier // optional external command notifier
	// stopCh is closed by Stop() to signal all goroutines to exit.
	stopCh chan struct{}
	// wg tracks the event loop, checksum worker, and poll loop goroutines.
	wg sync.WaitGroup
}
// NewSourceWatcher creates a new source file watcher with the given action.
// A pollInterval <= 0 selects defaultPollInterval. When onErrorCommand is
// non-nil, an ErrorNotifier is created to execute the configured command on
// integrity problems. The watcher does nothing until Start() is called.
func NewSourceWatcher(action string, pollInterval time.Duration, onErrorCommand *dedup.ErrorCommandConfig, logFn func(string, ...interface{})) (*SourceWatcher, error) {
	fsw, err := fsnotify.NewWatcher()
	if err != nil {
		return nil, fmt.Errorf("create fsnotify watcher: %w", err)
	}
	if logFn == nil {
		logFn = func(string, ...interface{}) {} // no-op logger
	}
	if pollInterval <= 0 {
		pollInterval = defaultPollInterval
	}
	var notifier *ErrorNotifier
	if onErrorCommand != nil {
		notifier = NewErrorNotifier(*onErrorCommand, logFn)
	}
	sw := &SourceWatcher{
		watcher:         fsw,
		reverse:         make(map[string][]*MKVFile),
		checksums:       make(map[string]uint64),
		sizes:           make(map[string]int64),
		pollFiles:       make(map[string]time.Time),
		action:          action,
		logFn:           logFn,
		checksumCh:      make(chan checksumRequest, 256),
		checksumPending: make(map[string]bool),
		pollInterval:    pollInterval,
		notifier:        notifier,
		stopCh:          make(chan struct{}),
	}
	return sw, nil
}
// Update rebuilds the watcher's source file mappings from the current file set.
// It removes old watches and sets up new ones. Called on mount and after reload.
//
// For each MKVFile, the readerFactory is used to read the dedup file header
// (lazy read, no full initialization) to get the source file list.
//
// The method minimizes lock hold time: maps are built without the lock,
// swapped in briefly under the lock, and then inotify watches and os.Stat
// calls happen without the lock.
func (sw *SourceWatcher) Update(files map[string]*MKVFile, readerFactory ReaderFactory) {
	// Phase 1: Build new maps without holding the lock. This involves
	// I/O (reading dedup headers) that should not block event handling.
	newReverse := make(map[string][]*MKVFile)
	newChecksums := make(map[string]uint64)
	newSizes := make(map[string]int64)
	watchDirs := make(map[string]bool)
	for _, file := range files {
		reader, err := readerFactory.NewReaderLazy(file.DedupPath, file.SourceDir)
		if err != nil {
			sw.logFn("source-watch: warning: cannot read dedup header for %s: %v", file.Name, err)
			continue
		}
		sourceFiles := reader.SourceFileInfo()
		reader.Close()
		// Normalize the source dir to a trailing-separator form so the
		// HasPrefix containment check below can't be fooled by sibling
		// directories sharing a name prefix (e.g. /data vs /data2).
		cleanSourceDir := filepath.Clean(file.SourceDir)
		if cleanSourceDir[len(cleanSourceDir)-1] != filepath.Separator {
			cleanSourceDir += string(filepath.Separator)
		}
		for _, sf := range sourceFiles {
			absPath := filepath.Clean(filepath.Join(file.SourceDir, sf.RelativePath))
			if !strings.HasPrefix(absPath, cleanSourceDir) {
				sw.logFn("source-watch: warning: skipping source file with path traversal: %s", sf.RelativePath)
				continue
			}
			newReverse[absPath] = append(newReverse[absPath], file)
			newChecksums[absPath] = sf.Checksum
			newSizes[absPath] = sf.Size
			watchDirs[filepath.Dir(absPath)] = true
		}
	}
	// Phase 2: Swap maps and drain stale checksum queue under the lock.
	sw.mu.Lock()
	oldDirs := sw.watchedDirs()
	// Drain any stale checksum requests from a previous configuration.
drain:
	for {
		select {
		case <-sw.checksumCh:
		default:
			break drain
		}
	}
	sw.checksumPending = make(map[string]bool)
	// Bump the generation so any in-flight verification from the old config
	// is recognized as stale by checksumWorker/verifyChecksum.
	sw.updateGen++
	sw.reverse = newReverse
	sw.checksums = newChecksums
	sw.sizes = newSizes
	sw.pollFiles = make(map[string]time.Time)
	sw.mu.Unlock()
	// Phase 3: Update inotify watches without the lock.
	// fsnotify.Watcher methods are thread-safe.
	for dir := range oldDirs {
		sw.watcher.Remove(dir)
	}
	// Precompute files per directory so polling setup is O(files), not O(dirs×files).
	pathsByDir := make(map[string][]string)
	for absPath := range newReverse {
		dir := filepath.Dir(absPath)
		pathsByDir[dir] = append(pathsByDir[dir], absPath)
	}
	newPollFiles := make(map[string]time.Time)
	for dir := range watchDirs {
		if isNetworkFS(dir) {
			sw.logFn("source-watch: %s is on a network filesystem, using polling", dir)
			for _, absPath := range pathsByDir[dir] {
				if info, err := os.Stat(absPath); err == nil {
					newPollFiles[absPath] = info.ModTime()
				} else {
					// File currently missing/unavailable — use zero mtime so
					// pollCheck detects it appearing (or triggers handleChange
					// via its stat-error path).
					newPollFiles[absPath] = time.Time{}
				}
			}
		} else {
			if err := sw.watcher.Add(dir); err != nil {
				sw.logFn("source-watch: warning: cannot watch %s: %v", dir, err)
			}
		}
	}
	// Phase 4: Set poll files under the lock.
	if len(newPollFiles) > 0 {
		sw.mu.Lock()
		sw.pollFiles = newPollFiles
		sw.mu.Unlock()
	}
	sw.logFn("source-watch: monitoring %d source files in %d directories (action=%s)",
		len(newReverse), len(watchDirs), sw.action)
}
// watchedDirs returns the set of directories containing currently tracked
// source files. It reads sw.reverse, so callers must hold sw.mu (Update calls
// it under the write lock).
func (sw *SourceWatcher) watchedDirs() map[string]bool {
	out := make(map[string]bool, len(sw.reverse))
	for p := range sw.reverse {
		out[filepath.Dir(p)] = true
	}
	return out
}
// Start launches the watcher's background goroutines. Must be called after
// Update().
func (sw *SourceWatcher) Start() {
	launch := func(fn func()) {
		sw.wg.Add(1)
		go fn()
	}
	launch(sw.eventLoop)
	// Single checksum worker: only needed when the action verifies hashes;
	// one goroutine keeps hashing sequential.
	if sw.action == "checksum" {
		launch(sw.checksumWorker)
	}
	// Always run the poller — it no-ops while pollFiles is empty, but must
	// exist so network-FS directories added by a later Update() (after a
	// reload) are polled without a restart.
	launch(sw.pollLoop)
}
// Stop shuts the watcher down: it signals every goroutine, closes the
// underlying fsnotify watcher, and blocks until all goroutines exit. A
// configured notifier is stopped last, flushing any pending events.
func (sw *SourceWatcher) Stop() {
	close(sw.stopCh)
	sw.watcher.Close()
	sw.wg.Wait()
	if n := sw.notifier; n != nil {
		n.Stop()
	}
}
// notify forwards an error event to the external-command notifier, if one is
// configured; otherwise it does nothing.
func (sw *SourceWatcher) notify(sourcePath, event string, names []string) {
	n := sw.notifier
	if n == nil {
		return
	}
	n.Notify(ErrorEvent{
		SourcePath:    sourcePath,
		AffectedFiles: names,
		Event:         event,
	})
}
// eventLoop consumes fsnotify events and errors until the watcher channels
// close or Stop() is called.
func (sw *SourceWatcher) eventLoop() {
	defer sw.wg.Done()
	// Only writes, creates (overwrites), renames, and removals matter.
	const relevant = fsnotify.Write | fsnotify.Create | fsnotify.Rename | fsnotify.Remove
	for {
		select {
		case ev, open := <-sw.watcher.Events:
			if !open {
				return
			}
			if ev.Op&relevant != 0 {
				sw.handleChange(ev.Name)
			}
		case err, open := <-sw.watcher.Errors:
			if !open {
				return
			}
			sw.logFn("source-watch: watcher error: %v", err)
		case <-sw.stopCh:
			return
		}
	}
}
// pollLoop drives periodic change detection for files on network filesystems,
// where inotify is unreliable. It ticks at sw.pollInterval until stopped.
func (sw *SourceWatcher) pollLoop() {
	defer sw.wg.Done()
	tick := time.NewTicker(sw.pollInterval)
	defer tick.Stop()
	for {
		select {
		case <-sw.stopCh:
			return
		case <-tick.C:
			sw.pollCheck()
		}
	}
}
// pollCheck stats all poll-monitored files and triggers handleChange for
// any that have a different mtime than recorded. It snapshots the poll set
// under a read lock, performs os.Stat calls without the lock (network FS
// stats can block), then updates mtimes and processes changes.
func (sw *SourceWatcher) pollCheck() {
	// Snapshot under read lock so os.Stat doesn't block event handling.
	type polledFile struct {
		path      string
		lastMtime time.Time
	}
	sw.mu.RLock()
	snapshot := make([]polledFile, 0, len(sw.pollFiles))
	for absPath, lastMtime := range sw.pollFiles {
		snapshot = append(snapshot, polledFile{path: absPath, lastMtime: lastMtime})
	}
	sw.mu.RUnlock()
	// Stat without holding the lock.
	type mtimeUpdate struct {
		path     string
		newMtime time.Time
	}
	var (
		updates      []mtimeUpdate
		changedPaths []string
	)
	for _, pf := range snapshot {
		info, err := os.Stat(pf.path)
		if err != nil {
			// Stat failure (file missing/unreachable) counts as a change;
			// handleChange decides what to do about it.
			sw.logFn("source-watch: poll: cannot stat %s: %v", pf.path, err)
			changedPaths = append(changedPaths, pf.path)
			continue
		}
		if !info.ModTime().Equal(pf.lastMtime) {
			updates = append(updates, mtimeUpdate{path: pf.path, newMtime: info.ModTime()})
			changedPaths = append(changedPaths, pf.path)
		}
	}
	// Update stored mtimes under the lock.
	if len(updates) > 0 {
		sw.mu.Lock()
		for _, u := range updates {
			// Only record the new mtime if the path is still tracked —
			// an Update() may have replaced pollFiles in the meantime.
			if _, ok := sw.pollFiles[u.path]; ok {
				sw.pollFiles[u.path] = u.newMtime
			}
		}
		sw.mu.Unlock()
	}
	// Process changes — handleChange acquires the lock per-path.
	for _, absPath := range changedPaths {
		sw.handleChange(absPath)
	}
}
// handleChange processes a source file change event. It takes the write lock
// (handleChangeLocked may mutate checksumPending) and delegates.
func (sw *SourceWatcher) handleChange(absPath string) {
	sw.mu.Lock()
	defer sw.mu.Unlock()
	sw.handleChangeLocked(absPath)
}
// handleChangeLocked processes a source file change according to sw.action:
// "warn" logs and notifies; "disable" disables the affected virtual files;
// "checksum" disables on missing file or size change, otherwise queues a
// background checksum verification. Caller must hold sw.mu (write lock).
func (sw *SourceWatcher) handleChangeLocked(absPath string) {
	affected, ok := sw.reverse[absPath]
	if !ok {
		return // Not a tracked source file
	}
	names := make([]string, len(affected))
	for i, f := range affected {
		names[i] = f.Name
	}
	switch sw.action {
	case "warn":
		sw.logFn("source-watch: WARNING: source file changed: %s (affects: %v)", absPath, names)
		sw.notify(absPath, "changed", names)
	case "disable":
		sw.logFn("source-watch: source file changed, disabling: %s (affects: %v)", absPath, names)
		for _, f := range affected {
			f.Disable()
		}
		sw.notify(absPath, "changed", names)
	case "checksum":
		// Stat the source file to distinguish size changes from
		// timestamp-only changes (e.g. touch).
		info, err := os.Stat(absPath)
		if err != nil {
			// File disappeared — disable immediately
			sw.logFn("source-watch: source file missing, disabling: %s (affects: %v)", absPath, names)
			for _, f := range affected {
				f.Disable()
			}
			sw.notify(absPath, "missing", names)
			return
		}
		expectedSize := sw.sizes[absPath]
		if info.Size() != expectedSize {
			// Size changed — definitely corrupted, disable immediately
			sw.logFn("source-watch: source file size changed (%d → %d), disabling: %s (affects: %v)",
				expectedSize, info.Size(), absPath, names)
			for _, f := range affected {
				f.Disable()
			}
			sw.notify(absPath, "size_changed", names)
			return
		}
		// Size matches — verify checksum in background. File remains
		// accessible during verification; only disabled on mismatch.
		if sw.checksumPending[absPath] {
			return // Already queued
		}
		sw.logFn("source-watch: source file modified, verifying checksum: %s (affects: %v)", absPath, names)
		// Copy the affected slice: the request outlives the lock, and a
		// later Update() may replace sw.reverse while the worker runs.
		affectedCopy := make([]*MKVFile, len(affected))
		copy(affectedCopy, affected)
		select {
		case sw.checksumCh <- checksumRequest{
			absPath:          absPath,
			expectedChecksum: sw.checksums[absPath],
			expectedSize:     expectedSize,
			affected:         affectedCopy,
			gen:              sw.updateGen,
		}:
			sw.checksumPending[absPath] = true
		default:
			// Queue full — disable as a safety measure
			sw.logFn("source-watch: checksum queue full, disabling: %s (affects: %v)", absPath, names)
			for _, f := range affected {
				f.Disable()
			}
			sw.notify(absPath, "checksum_queue_full", names)
		}
	}
}
// checksumWorker processes checksum verification requests sequentially.
// Only one goroutine runs this, ensuring that bulk source changes don't
// spawn hundreds of parallel I/O-heavy hash operations.
func (sw *SourceWatcher) checksumWorker() {
	defer sw.wg.Done()
	for {
		select {
		case req := <-sw.checksumCh:
			// Clear pending flag so new events for this path get queued.
			// This must happen before verification so that changes during
			// hashing trigger a fresh verification.
			sw.mu.Lock()
			delete(sw.checksumPending, req.absPath)
			// A generation mismatch means Update() ran after this request
			// was queued; its checksums/sizes may no longer apply.
			stale := req.gen != sw.updateGen
			sw.mu.Unlock()
			if stale {
				continue // Config was reloaded; skip stale request
			}
			sw.verifyChecksum(req.absPath, req.expectedChecksum, req.expectedSize, req.affected, req.gen)
		case <-sw.stopCh:
			return
		}
	}
}
// verifyChecksum re-hashes a source file in the background. Files remain
// accessible during verification. If the checksum mismatches, affected
// virtual files are disabled (recoverable via SIGHUP reload or a
// subsequent successful checksum). The gen parameter is checked before
// disabling or enabling so that a reload during verification prevents
// stale results from affecting files in the new configuration.
func (sw *SourceWatcher) verifyChecksum(absPath string, expectedChecksum uint64, expectedSize int64, affected []*MKVFile, gen uint64) {
	names := make([]string, len(affected))
	for i, f := range affected {
		names[i] = f.Name
	}
	// disableIfCurrent disables affected files only if the watcher
	// generation hasn't changed (i.e., no reload occurred during verification).
	disableIfCurrent := func() {
		sw.mu.RLock()
		stale := gen != sw.updateGen
		sw.mu.RUnlock()
		if stale {
			sw.logFn("source-watch: checksum: skipping disable for %s (config reloaded during verification)", absPath)
			return
		}
		for _, f := range affected {
			f.Disable()
		}
	}
	// Re-check size — it may have changed since the event was queued
	info, err := os.Stat(absPath)
	if err != nil {
		sw.logFn("source-watch: checksum: cannot stat %s: %v — disabling %v", absPath, err, names)
		disableIfCurrent()
		sw.notify(absPath, "missing", names)
		return
	}
	if info.Size() != expectedSize {
		sw.logFn("source-watch: checksum: size changed for %s (%d → %d) — disabling %v",
			absPath, expectedSize, info.Size(), names)
		disableIfCurrent()
		sw.notify(absPath, "size_changed", names)
		return
	}
	// Full xxhash checksum
	f, err := os.Open(absPath)
	if err != nil {
		sw.logFn("source-watch: checksum: cannot open %s: %v — disabling %v", absPath, err, names)
		disableIfCurrent()
		sw.notify(absPath, "missing", names)
		return
	}
	defer f.Close()
	h := xxhash.New()
	buf := make([]byte, 1<<20) // 1MB buffer
	for {
		// Check for shutdown between reads so large-file hashing
		// doesn't block Stop() indefinitely.
		select {
		case <-sw.stopCh:
			return
		default:
		}
		n, readErr := f.Read(buf)
		if n > 0 {
			h.Write(buf[:n])
		}
		if readErr != nil {
			if readErr != io.EOF {
				sw.logFn("source-watch: checksum: read error for %s: %v — disabling %v", absPath, readErr, names)
				disableIfCurrent()
				sw.notify(absPath, "read_error", names)
				return
			}
			break
		}
	}
	actualChecksum := h.Sum64()
	if actualChecksum != expectedChecksum {
		sw.logFn("source-watch: checksum mismatch for %s (got %016x, expected %016x) — disabling %v",
			absPath, actualChecksum, expectedChecksum, names)
		disableIfCurrent()
		sw.notify(absPath, "checksum_mismatch", names)
	} else {
		// Re-enable affected files so transient issues (e.g., network
		// glitches) auto-recover without requiring admin SIGHUP.
		//
		// NOTE: a virtual file can depend on multiple source files. A
		// passing checksum for one source could re-enable a file whose
		// other source is still bad. This is a known limitation; the
		// common case (single source per MKV) is handled correctly, and
		// SIGHUP is available as a fallback for multi-source edge cases.
		sw.mu.RLock()
		stale := gen != sw.updateGen
		sw.mu.RUnlock()
		if stale {
			sw.logFn("source-watch: checksum: skipping re-enable for %s (config reloaded during verification)", absPath)
			return
		}
		sw.logFn("source-watch: checksum verified OK for %s — re-enabling %v", absPath, names)
		for _, f := range affected {
			f.Enable()
		}
	}
}
//go:build linux
package fuse
import "golang.org/x/sys/unix"
// Filesystem type constants for network FS detection. These are the f_type
// magic numbers reported by statfs(2); compared against Statfs_t.Type in
// isNetworkFS below.
const (
	nfsSuperMagic   = 0x6969     // NFS
	cifsMagicNum    = 0xFF534D42 // CIFS/SMB1
	smb2MagicNum    = 0xFE534D42 // SMB2/SMB3
	afsSuper        = 0x5346414F // AFS
	ncpfsSuperMagic = 0x564C     // NCPFS (NetWare)
)
// IsNetworkFS checks if the given path is on a network filesystem.
// Exported for integration testing; internal callers use isNetworkFS.
func IsNetworkFS(path string) bool {
	return isNetworkFS(path)
}
// isNetworkFS reports whether path lives on a network filesystem, judged by
// the statfs(2) filesystem magic number. A failed statfs is treated as local.
func isNetworkFS(path string) bool {
	var fs unix.Statfs_t
	if unix.Statfs(path, &fs) != nil {
		// Can't determine — assume local
		return false
	}
	switch fs.Type {
	case nfsSuperMagic, cifsMagicNum, smb2MagicNum, afsSuper, ncpfsSuperMagic:
		return true
	default:
		return false
	}
}
// Package matcher provides the core deduplication logic for matching MKV packets to source files.
package matcher
import (
"fmt"
"io"
"os"
"runtime"
"strings"
"sync"
"sync/atomic"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/mmap"
"github.com/stuckj/mkvdup/internal/source"
)
// Tuning constants for the matching algorithm.
const (
	// MaxExpansionBytes is the maximum number of bytes to expand a match in each direction.
	// Set high to allow matching entire video keyframes which can be several MB.
	MaxExpansionBytes = 16 * 1024 * 1024 // 16MB
	// localityNearbyCount is the max number of nearby locations to try in Phase 1
	// of locality-aware matching before falling back to a full search.
	localityNearbyCount = 8
	// localityGoodMatchThreshold is the minimum match length (bytes) to accept
	// from a nearby location without trying all remaining locations.
	// At 4KB (64x the 64-byte window), a false positive is vanishingly unlikely.
	localityGoodMatchThreshold = 4096
	// phase2MaxVerifyAttempts caps the number of tryVerifyAndExpand calls in
	// Phase 2. Common audio patterns (e.g. DTS core headers) can produce the
	// same 64-byte hash across hundreds of source files. When none verify,
	// the uncapped scan causes I/O thrashing. 64 attempts is >10x the
	// observed average for successful Phase 2 searches (~6 locations).
	phase2MaxVerifyAttempts = 64
)
// detectNALLengthSize determines the NAL unit length field size from an MKV track's
// codec ID and codec private data. Returns 0 for Annex B (start code) formats,
// or the length field size (1, 2, or 4) for AVCC/HVCC formats.
//
// Both the AVC and HEVC paths now validate that the declared size is 1, 2, or
// 4 bytes: a 3-byte length field (lengthSizeMinusOne == 2) is reserved in the
// AVC/HEVC decoder configuration records, so a declaration of 3 falls back to
// the default of 4. Previously only the HEVC path performed this validation.
func detectNALLengthSize(codecID string, codecPrivate []byte) int {
	switch codecID {
	case "V_MPEG4/ISO/AVC":
		// AVCC format: CodecPrivate is AVCDecoderConfigurationRecord
		// Byte 0 = configurationVersion (must be 1)
		// Byte 4 bits 0-1 = NAL length size - 1
		if len(codecPrivate) >= 7 && codecPrivate[0] == 1 {
			size := int(codecPrivate[4]&0x03) + 1
			// Valid NAL length sizes are 1, 2, or 4 bytes (3 is reserved)
			if size == 1 || size == 2 || size == 4 {
				return size
			}
		}
		return 4 // Default for AVC if CodecPrivate is missing or malformed
	case "V_MPEGH/ISO/HEVC":
		// HVCC format: CodecPrivate is HEVCDecoderConfigurationRecord
		// Byte 0 = configurationVersion (must be 1)
		// Byte 21 bits 6-7 = reserved (must be 111111)
		// Byte 21 bits 0-1 = NAL length size - 1
		if len(codecPrivate) >= 23 && codecPrivate[0] == 1 {
			b := codecPrivate[21]
			// Upper 6 bits must be all 1s per ISO/IEC 23008-2
			if b&0xFC == 0xFC {
				size := int(b&0x03) + 1
				// Valid NAL length sizes are 1, 2, or 4 bytes
				if size == 1 || size == 2 || size == 4 {
					return size
				}
			}
		}
		return 4 // Default for HEVC if CodecPrivate is missing or malformed
	default:
		return 0 // Annex B format (MPEG-2, etc.)
	}
}
// NALLengthSizeForTrack returns the NAL length size for a track, suitable for
// use by external callers like ExtractProbeHashes. Returns 0 for Annex B.
// Thin exported wrapper around detectNALLengthSize.
func NALLengthSizeForTrack(codecID string, codecPrivate []byte) int {
	return detectNALLengthSize(codecID, codecPrivate)
}
// matchedRegion tracks a region of the MKV file that was matched to a source.
type matchedRegion struct {
	mkvStart         int64  // start offset of the region in the MKV file
	mkvEnd           int64  // end offset of the region in the MKV file
	fileIndex        uint16 // source file index within the source index
	srcOffset        int64  // File offset or ES offset depending on source type
	isVideo          bool   // For ES-based sources
	audioSubStreamID byte   // For audio in MPEG-PS
	isLPCM           bool   // True if this is an LPCM audio region requiring inverse transform
}

// coverageChunkSize is the granularity for coverage tracking.
// Smaller values give more accurate coverage checks but use more memory.
const coverageChunkSize = 4096 // 4KB chunks
// trackCodecInfo stores per-track codec information for format-aware matching.
type trackCodecInfo struct {
	trackType     int // MKV track type (video/audio/...)
	nalLengthSize int // 0 = Annex B (start codes), 1/2/4 = AVCC/HVCC (length-prefixed NAL units)
}

// trackCrossPacketHint stores per-track locality state for cross-packet
// handoff. Protected by a mutex to avoid torn reads when multiple
// goroutines process different packets on the same track concurrently.
// Read once at packet start, written once after the last match in a packet.
type trackCrossPacketHint struct {
	mu      sync.Mutex
	valid   bool   // false until the first match on this track
	fileIdx uint16 // source file of the last match
	offset  int64  // Midpoint of last matched source region (for Phase 1 hash locality)
	srcEnd  int64  // End of last matched source region (for locality recovery)
	mkvEnd  int64  // End of last matched MKV region (for locality recovery)
}

// packetLocality tracks per-packet locality state for deterministic
// intra-packet matching. Updated sequentially by a single goroutine,
// eliminating torn reads from shared state.
type packetLocality struct {
	valid   bool   // false until the first match in this packet
	fileIdx uint16 // source file of the last match
	offset  int64  // Midpoint of last match (for Phase 1)
	srcEnd  int64  // End of last matched source region
	mkvEnd  int64  // End of last matched MKV region
}
// Matcher performs the deduplication matching: it memory-maps an MKV file
// and matches its packets against a source.Index, recording matched regions
// and extensive diagnostic counters along the way. See Match for the
// top-level flow.
type Matcher struct {
	sourceIndex     *source.Index
	mkvMmap         *mmap.File
	mkvData         []byte // Zero-copy mmap'd MKV data
	mkvSize         int64
	windowSize      int // hash window size, taken from the source index
	matchedRegions  []matchedRegion
	regionsMu       sync.Mutex             // Protects matchedRegions for concurrent access
	trackTypes      map[int]int            // Map from track number to track type
	trackCodecs     map[int]trackCodecInfo // Map from track number to codec info
	numWorkers      int                    // Number of worker goroutines for parallel matching
	verboseWriter   io.Writer              // Destination for diagnostic output (nil = disabled)
	isAVCTrack      map[int]bool           // Per-track: whether this track uses H.264 NAL types
	isPCMTrack      map[int]bool           // Per-track: whether this track uses PCM audio (A_PCM/*)
	isTrueHDTrack   map[int]bool           // Per-track: whether this track uses TrueHD audio (A_TRUEHD)
	// Coverage bitmap for O(1) coverage checks. Each bit represents a chunk.
	// A chunk is marked covered when a matched region fully contains it.
	coveredChunks []uint64 // Bitmap: bit i = chunk i is covered
	coverageMu    sync.RWMutex
	// Per-track locality hints. Each track gets its own hint so interleaved
	// packets from different tracks (e.g. multiple DTS streams) don't thrash
	// a single shared hint. Created in Match() before workers start; the map
	// itself is read-only during matching, each hint is mutex-synchronized.
	trackHints map[uint64]*trackCrossPacketHint
	// Diagnostic counters for investigating match failures
	diagVideoPacketsTotal       atomic.Int64 // Total video packets processed
	diagVideoPacketsCoverage    atomic.Int64 // Video packets skipped (coverage check)
	diagVideoNALsTotal          atomic.Int64 // Total video NAL sync points tried
	diagVideoNALsTooSmall       atomic.Int64 // NALs where window didn't fit
	diagVideoNALsHashNotFound   atomic.Int64 // NALs where hash wasn't in index
	diagVideoNALsVerifyFailed   atomic.Int64 // NALs where hash found but all verifications failed
	diagVideoNALsAllSkipped     atomic.Int64 // NALs where hash found but all locations skipped (e.g. isVideo mismatch)
	diagVideoNALsMatched        atomic.Int64 // NALs successfully matched
	diagVideoNALsMatchedBytes   atomic.Int64 // Total bytes from matched video NALs
	diagVideoNALsSkippedIsVideo atomic.Int64 // Locations skipped due to isVideo mismatch
	// Per-NAL-type diagnostics (H.264 NAL type = first byte & 0x1F)
	diagNALTypeNotFound [32]atomic.Int64 // hash not found, by NAL type
	diagNALTypeMatched  [32]atomic.Int64 // matched, by NAL type
	diagNALTypeTotal    [32]atomic.Int64 // total attempted, by NAL type
	// NAL size bucket diagnostics (video only)
	// Buckets: 0=<64, 1=64-127, 2=128-1023, 3=1K-32K, 4=32K+
	diagNALSizeMatched   [5]atomic.Int64
	diagNALSizeUnmatched [5]atomic.Int64
	// Phase 2 diagnostics (all track types)
	diagPhase2Fallbacks  atomic.Int64 // Times Phase 2 full search was triggered
	diagPhase2Locations  atomic.Int64 // Total locations checked in Phase 2
	diagPhase2EarlyExits atomic.Int64 // Times Phase 2 exited early (full-frame match found)
	diagPhase2Capped     atomic.Int64 // Times Phase 2 hit the verify attempt cap
	diagPhase1Skips      atomic.Int64 // Times Phase 2 was skipped (Phase 1 sufficient)
	diagTotalSyncPoints  atomic.Int64 // Total match attempts (all track types)
	// Locality recovery diagnostics
	diagLocalityAttempts     atomic.Int64 // Times locality recovery was attempted
	diagLocalityMatched      atomic.Int64 // Times locality recovery succeeded
	diagLocalityMatchedBytes atomic.Int64 // Total bytes recovered via locality
	// First few hash-not-found examples for debugging
	diagExamplesMu     sync.Mutex
	diagExamplesCount  int
	diagExamplesOutput []string
}
// nalSizeBucket maps a NAL size in bytes to a diagnostics bucket index.
// Buckets: 0=<64, 1=64-127, 2=128-1023, 3=1K-32K, 4=32K+
func nalSizeBucket(size int) int {
	upperBounds := [...]int{64, 128, 1024, 32768}
	for bucket, bound := range upperBounds {
		if size < bound {
			return bucket
		}
	}
	return 4
}
// NewMatcher creates a new Matcher backed by the given source index. The
// worker count defaults to half the CPU count (minimum one); override it
// with SetNumWorkers.
func NewMatcher(sourceIndex *source.Index) (*Matcher, error) {
	workers := runtime.NumCPU() / 2
	if workers < 1 {
		workers = 1
	}
	m := &Matcher{
		sourceIndex:   sourceIndex,
		windowSize:    sourceIndex.WindowSize,
		trackTypes:    map[int]int{},
		trackCodecs:   map[int]trackCodecInfo{},
		isAVCTrack:    map[int]bool{},
		isPCMTrack:    map[int]bool{},
		isTrueHDTrack: map[int]bool{},
		numWorkers:    workers,
	}
	return m, nil
}
// SetVerboseWriter sets the destination for diagnostic output during matching.
// Pass nil to disable verbose output. Not safe to call while Match is running.
func (m *Matcher) SetVerboseWriter(w io.Writer) {
	m.verboseWriter = w
}
// SetNumWorkers sets the number of worker goroutines used for parallel
// matching. Values below one are clamped to one.
func (m *Matcher) SetNumWorkers(n int) {
	if n >= 1 {
		m.numWorkers = n
		return
	}
	m.numWorkers = 1
}
// Close releases resources held by the Matcher — specifically the MKV memory
// map if one was opened by Match. Always returns nil.
func (m *Matcher) Close() error {
	if m.mkvMmap != nil {
		m.mkvMmap.Close()
	}
	return nil
}
// ProgressFunc is called to report matching progress: processedPackets out of
// totalPackets handled so far.
type ProgressFunc func(processedPackets, totalPackets int)
// Match processes an MKV file and matches packets to the source.
//
// mkvPath is memory-mapped for zero-copy access; packets and tracks come
// from the MKV demuxer. progress, if non-nil, is invoked periodically and
// once more at completion. Returns the match result with entries, delta
// writer and statistics, or an error on I/O / entry-building failure.
//
// Fixes applied: a mapping left over from a previous Match() call is now
// released before re-mapping (previously leaked, since Close() only unmaps
// the most recent handle), and the locality diagnostic counters are reset
// along with all the other per-run counters (previously they accumulated
// across runs).
func (m *Matcher) Match(mkvPath string, packets []mkv.Packet, tracks []mkv.Track, progress ProgressFunc) (*Result, error) {
	// Memory-map the MKV file for zero-copy access
	info, err := os.Stat(mkvPath)
	if err != nil {
		return nil, fmt.Errorf("stat MKV: %w", err)
	}
	m.mkvSize = info.Size()
	// Release any mapping left over from a previous Match() call so
	// repeated runs don't leak file mappings.
	if m.mkvMmap != nil {
		m.mkvMmap.Close()
		m.mkvMmap = nil
	}
	m.mkvMmap, err = mmap.Open(mkvPath)
	if err != nil {
		return nil, fmt.Errorf("mmap MKV: %w", err)
	}
	m.mkvData = m.mkvMmap.Data() // Store reference for zero-copy access
	// Reset per-run state in case Match() is called multiple times
	m.trackTypes = make(map[int]int)
	m.trackCodecs = make(map[int]trackCodecInfo)
	m.isAVCTrack = make(map[int]bool)
	m.isPCMTrack = make(map[int]bool)
	m.isTrueHDTrack = make(map[int]bool)
	m.diagVideoPacketsTotal.Store(0)
	m.diagVideoPacketsCoverage.Store(0)
	m.diagVideoNALsTotal.Store(0)
	m.diagVideoNALsTooSmall.Store(0)
	m.diagVideoNALsHashNotFound.Store(0)
	m.diagVideoNALsVerifyFailed.Store(0)
	m.diagVideoNALsAllSkipped.Store(0)
	m.diagVideoNALsMatched.Store(0)
	m.diagVideoNALsMatchedBytes.Store(0)
	m.diagVideoNALsSkippedIsVideo.Store(0)
	for i := range m.diagNALTypeNotFound {
		m.diagNALTypeNotFound[i].Store(0)
		m.diagNALTypeMatched[i].Store(0)
		m.diagNALTypeTotal[i].Store(0)
	}
	for i := range m.diagNALSizeMatched {
		m.diagNALSizeMatched[i].Store(0)
		m.diagNALSizeUnmatched[i].Store(0)
	}
	m.diagPhase2Fallbacks.Store(0)
	m.diagPhase2Locations.Store(0)
	m.diagPhase2EarlyExits.Store(0)
	m.diagPhase2Capped.Store(0)
	m.diagPhase1Skips.Store(0)
	m.diagTotalSyncPoints.Store(0)
	// Locality counters are read by the diagnostics dump below and updated
	// by tryLocalityMatch; reset them like every other per-run counter.
	m.diagLocalityAttempts.Store(0)
	m.diagLocalityMatched.Store(0)
	m.diagLocalityMatchedBytes.Store(0)
	m.diagExamplesMu.Lock()
	m.diagExamplesCount = 0
	m.diagExamplesOutput = nil
	m.diagExamplesMu.Unlock()
	// Initialize per-track locality hints so each track has its own hint.
	// Zero value of trackCrossPacketHint has valid == false, which is correct.
	m.trackHints = make(map[uint64]*trackCrossPacketHint, len(tracks))
	for _, t := range tracks {
		m.trackHints[t.Number] = &trackCrossPacketHint{}
	}
	// Build track type and codec info maps
	for _, t := range tracks {
		m.trackTypes[int(t.Number)] = t.Type
		nlSize := detectNALLengthSize(t.CodecID, t.CodecPrivate)
		m.trackCodecs[int(t.Number)] = trackCodecInfo{
			trackType:     t.Type,
			nalLengthSize: nlSize,
		}
		if t.Type == mkv.TrackTypeVideo && strings.HasPrefix(t.CodecID, "V_MPEG4/ISO/AVC") {
			m.isAVCTrack[int(t.Number)] = true
		}
		if t.Type == mkv.TrackTypeAudio && strings.HasPrefix(t.CodecID, "A_PCM/") {
			m.isPCMTrack[int(t.Number)] = true
		}
		if t.Type == mkv.TrackTypeAudio && t.CodecID == "A_TRUEHD" {
			m.isTrueHDTrack[int(t.Number)] = true
		}
	}
	// Reset matched regions with pre-allocated capacity
	// Most packets will match, so estimate capacity as number of packets
	m.matchedRegions = make([]matchedRegion, 0, len(packets))
	// Initialize coverage bitmap
	// Each uint64 holds 64 chunk bits, so we need (numChunks + 63) / 64 uint64s
	numChunks := (m.mkvSize + coverageChunkSize - 1) / coverageChunkSize
	m.coveredChunks = make([]uint64, (numChunks+63)/64)
	// Pre-sort source locations by offset to enable binary search for
	// locality-aware matching. One-time cost before concurrent access.
	m.sourceIndex.SortLocationsByOffset()
	// Set appropriate madvise hints for matching access patterns.
	m.sourceIndex.AdviseForMatching()
	result := &Result{
		TotalPackets: len(packets),
	}
	// Use parallel processing with worker pool
	result.MatchedPackets = m.matchParallel(packets, progress)
	if progress != nil {
		progress(len(packets), len(packets))
	}
	// Print diagnostic summary (verbose only)
	if m.verboseWriter != nil {
		w := m.verboseWriter
		fmt.Fprintf(w, "\n=== Video Matching Diagnostics ===\n")
		fmt.Fprintf(w, "Video packets total: %d\n", m.diagVideoPacketsTotal.Load())
		fmt.Fprintf(w, "Video packets skip-covered: %d\n", m.diagVideoPacketsCoverage.Load())
		fmt.Fprintf(w, "Video NALs total: %d\n", m.diagVideoNALsTotal.Load())
		fmt.Fprintf(w, "Video NALs too small: %d\n", m.diagVideoNALsTooSmall.Load())
		fmt.Fprintf(w, "Video NALs hash not found: %d\n", m.diagVideoNALsHashNotFound.Load())
		fmt.Fprintf(w, "Video NALs verify failed: %d\n", m.diagVideoNALsVerifyFailed.Load())
		fmt.Fprintf(w, "Video NALs all skipped: %d\n", m.diagVideoNALsAllSkipped.Load())
		fmt.Fprintf(w, "Video NALs matched: %d\n", m.diagVideoNALsMatched.Load())
		fmt.Fprintf(w, "Video NALs matched bytes: %d (%.2f MB)\n",
			m.diagVideoNALsMatchedBytes.Load(), float64(m.diagVideoNALsMatchedBytes.Load())/(1024*1024))
		fmt.Fprintf(w, "Video NALs isVideo skips: %d\n", m.diagVideoNALsSkippedIsVideo.Load())
		if len(m.isAVCTrack) > 0 {
			fmt.Fprintf(w, "\nPer-NAL-type breakdown (H.264, type: total / matched / not_found / miss%%):\n")
			nalTypeNames := map[byte]string{
				1: "non-IDR slice", 2: "slice A", 3: "slice B", 4: "slice C",
				5: "IDR slice", 6: "SEI", 7: "SPS", 8: "PPS", 9: "AUD", 12: "filler",
			}
			for i := 0; i < 32; i++ {
				total := m.diagNALTypeTotal[i].Load()
				if total == 0 {
					continue
				}
				matched := m.diagNALTypeMatched[i].Load()
				notFound := m.diagNALTypeNotFound[i].Load()
				name := nalTypeNames[byte(i)]
				if name == "" {
					name = "other"
				}
				fmt.Fprintf(w, "  type %2d (%14s): %8d / %8d / %8d (%.1f%% miss)\n",
					i, name, total, matched, notFound, float64(notFound)/float64(total)*100)
			}
		}
		// NAL size bucket breakdown
		nalSizeBucketNames := [5]string{"<64B", "64-127B", "128B-1KB", "1KB-32KB", "32KB+"}
		fmt.Fprintf(w, "\nVideo NAL size distribution (matched / unmatched):\n")
		for i := 0; i < 5; i++ {
			matched := m.diagNALSizeMatched[i].Load()
			unmatched := m.diagNALSizeUnmatched[i].Load()
			if matched > 0 || unmatched > 0 {
				fmt.Fprintf(w, "  %9s: %8d matched, %8d unmatched\n",
					nalSizeBucketNames[i], matched, unmatched)
			}
		}
		fmt.Fprintf(w, "\nTotal match attempts: %d\n", m.diagTotalSyncPoints.Load())
		fmt.Fprintf(w, "Phase 1 skips (Phase 2 avoided): %d\n", m.diagPhase1Skips.Load())
		fmt.Fprintf(w, "Phase 2 full-search fallbacks: %d\n", m.diagPhase2Fallbacks.Load())
		fmt.Fprintf(w, "Phase 2 total locations checked: %d\n", m.diagPhase2Locations.Load())
		fmt.Fprintf(w, "Phase 2 early exits: %d\n", m.diagPhase2EarlyExits.Load())
		fmt.Fprintf(w, "Phase 2 capped (hit %d limit): %d\n", phase2MaxVerifyAttempts, m.diagPhase2Capped.Load())
		fmt.Fprintf(w, "\nLocality recovery:\n")
		fmt.Fprintf(w, "  Attempts: %d\n", m.diagLocalityAttempts.Load())
		fmt.Fprintf(w, "  Matched:  %d\n", m.diagLocalityMatched.Load())
		fmt.Fprintf(w, "  Bytes:    %d\n", m.diagLocalityMatchedBytes.Load())
		fmt.Fprintf(w, "\nFirst hash-not-found examples:\n")
		for _, ex := range m.diagExamplesOutput {
			fmt.Fprintf(w, "%s\n", ex)
		}
		fmt.Fprintf(w, "=================================\n")
	}
	// Fill TrueHD gaps using adjacent matched regions
	m.fillTrueHDGaps(packets)
	// Merge overlapping regions and build final entries
	m.mergeRegions()
	var buildErr error
	result.Entries, result.DeltaFile, buildErr = m.buildEntries()
	if buildErr != nil {
		return nil, fmt.Errorf("build entries: %w", buildErr)
	}
	// Calculate statistics
	for _, e := range result.Entries {
		if e.Source == 0 {
			result.UnmatchedBytes += e.Length
		} else {
			result.MatchedBytes += e.Length
		}
	}
	return result, nil
}
// ProbeHash represents a hash computed from a sync point in packet data.
type ProbeHash struct {
	// Hash is the xxhash64 of windowSize bytes starting at the sync point
	// (see ExtractProbeHashes).
	Hash uint64
	// IsVideo records whether the hash was taken from a video track's data,
	// mirroring the isVideo argument given to ExtractProbeHashes.
	IsVideo bool
}
// ExtractProbeHashes extracts probe hashes from packet data using sync point detection.
// This is the same algorithm used by the matcher to find matching points.
// The data should be the first few KB of a packet (typically up to 4096 bytes).
// windowSize should match the source index window size (typically 64 bytes).
// nalLengthSize is 0 for Annex B video, or 1/2/4 for AVCC/HVCC video.
// Returns nil if no valid hashes could be extracted.
func ExtractProbeHashes(data []byte, isVideo bool, windowSize int, nalLengthSize int) []ProbeHash {
	if len(data) < windowSize {
		return nil
	}
	// Locate candidate sync points with the codec-appropriate scanner.
	var starts []int
	switch {
	case isVideo && nalLengthSize > 0:
		starts = source.FindAVCCNALStarts(data, nalLengthSize)
	case isVideo:
		starts = source.FindVideoNALStarts(data)
	default:
		starts = source.FindAudioSyncPoints(data)
	}
	// Hash a full window at each sync point that leaves room for one.
	var result []ProbeHash
	for _, off := range starts {
		if off+windowSize > len(data) {
			continue
		}
		result = append(result, ProbeHash{
			Hash:    xxhash.Sum64(data[off : off+windowSize]),
			IsVideo: isVideo,
		})
	}
	if len(result) > 0 {
		return result
	}
	// No usable sync point — fall back to a single hash from the data start.
	return []ProbeHash{{
		Hash:    xxhash.Sum64(data[:windowSize]),
		IsVideo: isVideo,
	}}
}
package matcher
import (
"fmt"
"sort"
)
// mergeRegions merges overlapping matched regions.
// Regions from the same source with consistent offset mappings are merged into one.
// Overlapping regions from different sources (or inconsistent offsets) are clipped:
// the earlier region keeps its full range, the later region is trimmed to start
// after the earlier one ends.
func (m *Matcher) mergeRegions() {
	regions := m.matchedRegions
	if len(regions) == 0 {
		return
	}
	// Order regions by their MKV start offset before merging.
	sort.Slice(regions, func(a, b int) bool {
		return regions[a].mkvStart < regions[b].mkvStart
	})
	// The result holds at most len(regions) entries; pre-allocate capacity.
	out := make([]matchedRegion, 1, len(regions))
	out[0] = regions[0]
	for _, r := range regions[1:] {
		prev := &out[len(out)-1]
		if r.mkvStart >= prev.mkvEnd {
			// Disjoint — keep as a new region.
			out = append(out, r)
			continue
		}
		// Overlap. Two regions are coalescible when they reference the same
		// source file/stream and map the shared MKV bytes to the same source
		// bytes (identical offset arithmetic).
		contiguous := r.fileIndex == prev.fileIndex &&
			r.srcOffset == prev.srcOffset+(r.mkvStart-prev.mkvStart) &&
			r.isVideo == prev.isVideo &&
			r.audioSubStreamID == prev.audioSubStreamID
		switch {
		case contiguous:
			// Both regions were independently verified; extending the earlier
			// one preserves a correct combined mapping.
			if r.mkvEnd > prev.mkvEnd {
				prev.mkvEnd = r.mkvEnd
			}
		case r.mkvEnd > prev.mkvEnd:
			// Conflicting mapping: the earlier region wins. Trim r to begin
			// where prev ends, shifting its source offset by the same amount.
			shift := prev.mkvEnd - r.mkvStart
			r.mkvStart = prev.mkvEnd
			r.srcOffset += shift
			// Keep only non-empty remainders.
			if r.mkvStart < r.mkvEnd {
				out = append(out, r)
			}
		}
		// A conflicting region fully inside prev contributes nothing — drop it.
	}
	m.matchedRegions = out
}
// buildEntries creates the final entry list and streams delta data to a temp file.
// It walks the MKV byte range from start to end, emitting one entry per
// matched region and one entry per unmatched gap; gap bytes are copied from
// the mmap'd MKV into the delta writer.
func (m *Matcher) buildEntries() ([]Entry, *DeltaWriter, error) {
	out := make([]Entry, 0, len(m.matchedRegions)*2+1)
	dw, err := NewDeltaWriter()
	if err != nil {
		return nil, nil, err
	}
	var (
		deltaOff int64 // running offset into the delta file
		cursor   int64 // current position in the MKV file
		next     int   // index of the next unconsumed matched region
	)
	for cursor < m.mkvSize {
		// If the cursor sits inside the next matched region, emit a
		// source-backed entry covering the rest of that region.
		if next < len(m.matchedRegions) && m.matchedRegions[next].mkvStart <= cursor {
			r := &m.matchedRegions[next]
			if cursor >= r.mkvStart && cursor < r.mkvEnd {
				out = append(out, Entry{
					MkvOffset:        cursor,
					Length:           r.mkvEnd - cursor,
					Source:           uint16(r.fileIndex + 1),
					SourceOffset:     r.srcOffset + (cursor - r.mkvStart),
					IsVideo:          r.isVideo,
					AudioSubStreamID: r.audioSubStreamID,
					IsLPCM:           r.isLPCM,
				})
				cursor = r.mkvEnd
				next++
				continue
			}
		}
		// Otherwise we are in a gap: it runs to the next region (or EOF).
		gapEnd := m.mkvSize
		if next < len(m.matchedRegions) {
			gapEnd = m.matchedRegions[next].mkvStart
		}
		if gapEnd <= m.mkvSize {
			gapLen := gapEnd - cursor
			out = append(out, Entry{
				MkvOffset:    cursor,
				Length:       gapLen,
				Source:       0,
				SourceOffset: deltaOff,
			})
			// Stream the unmatched bytes straight from the mmap to the temp file.
			if err := dw.Write(m.mkvData[cursor:gapEnd]); err != nil {
				dw.Close()
				return nil, nil, fmt.Errorf("write delta: %w", err)
			}
			deltaOff += gapLen
		}
		cursor = gapEnd
	}
	if err := dw.Flush(); err != nil {
		dw.Close()
		return nil, nil, fmt.Errorf("flush delta: %w", err)
	}
	return out, dw, nil
}
package matcher
import (
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// expandChunkSize is the number of bytes to read at once during match expansion.
// Larger chunks reduce page faults when expanding across mmap'd source files.
// 4096 matches the common OS page size, so each chunk touches one page.
const expandChunkSize = 4096
// tryVerifyAndExpand attempts to verify and expand a match, returning the matched region or nil.
//
// pkt is the MKV packet containing the candidate sync point, loc is the
// source-index location whose hash matched, offsetInPacket is the sync
// point's byte offset within the packet, and isVideo marks the track type.
// It first compares up to windowSize bytes (clamped to the packet end)
// byte-for-byte, then expands the verified run in both directions via
// expandMatch. Returns nil when verification fails, the LPCM guard
// rejects the match, or the match length collapses to zero.
func (m *Matcher) tryVerifyAndExpand(pkt mkv.Packet, loc source.Location, offsetInPacket int64, isVideo bool) *matchedRegion {
	// The MKV offset where this sync point is
	mkvSyncOffset := pkt.Offset + offsetInPacket
	// Verify the initial match (at least windowSize bytes)
	verifyLen := int64(m.windowSize)
	remainingInPacket := pkt.Size - offsetInPacket
	if verifyLen > remainingInPacket {
		verifyLen = remainingInPacket
	}
	// Zero-copy: slice directly into mmap'd data
	endOffset := mkvSyncOffset + verifyLen
	if endOffset > m.mkvSize {
		return nil
	}
	mkvBuf := m.mkvData[mkvSyncOffset:endOffset]
	// Read source data - use ES reader for ES-based indexes, raw slice for zero-copy
	var srcBuf []byte
	var err error
	if m.sourceIndex.UsesESOffsets {
		srcBuf, err = m.sourceIndex.ReadESDataAt(loc, int(verifyLen))
		if err != nil || len(srcBuf) < int(verifyLen) {
			return nil
		}
	} else {
		// For raw indexes, use zero-copy slice
		srcBuf = m.sourceIndex.RawSlice(loc, int(verifyLen))
		if srcBuf == nil || len(srcBuf) < int(verifyLen) {
			return nil
		}
	}
	// Check if bytes match
	for i := range mkvBuf {
		if mkvBuf[i] != srcBuf[i] {
			return nil
		}
	}
	isLPCM := source.IsLPCMSubStreamID(loc.AudioSubStreamID)
	// Reject LPCM source matches when the MKV track is not PCM audio.
	// Without this check, coincidental byte-level matches between non-PCM
	// MKV data (e.g., AC3) and LPCM source data produce entries flagged
	// as LPCM. During reconstruction, the byte-swap transform is applied
	// to these entries, corrupting the output and causing verification failure.
	// Check before expansion to avoid unnecessary work.
	if isLPCM && !m.isPCMTrack[int(pkt.TrackNum)] {
		return nil
	}
	// Expand the match from the sync point
	mkvStart, srcStart, matchLen := m.expandMatch(
		mkvSyncOffset, loc, verifyLen,
	)
	// For LPCM entries, align boundaries to 2-byte sample pairs.
	// The byte-swap transform operates on pairs; an unpaired byte at either
	// end cannot be correctly swapped during FUSE reconstruction.
	if isLPCM && matchLen > 1 {
		// Odd source start: drop one leading byte on both sides to restore
		// pair alignment.
		if srcStart%2 == 1 {
			mkvStart++
			srcStart++
			matchLen--
		}
		// Odd length: drop the unpaired trailing byte.
		if matchLen%2 == 1 {
			matchLen--
		}
	}
	// The LPCM trimming above (or a degenerate expansion) can leave an
	// empty match; report no match in that case.
	if matchLen <= 0 {
		return nil
	}
	region := &matchedRegion{
		mkvStart:         mkvStart,
		mkvEnd:           mkvStart + matchLen,
		fileIndex:        loc.FileIndex,
		srcOffset:        srcStart,
		isVideo:          isVideo,
		audioSubStreamID: loc.AudioSubStreamID,
		isLPCM:           isLPCM,
	}
	return region
}
// expandMatch expands a verified match in both directions.
// It returns the (possibly moved) MKV start, source start, and total length
// after extending the initial run of initialLen matching bytes as far as the
// data allows.
func (m *Matcher) expandMatch(mkvOffset int64, loc source.Location, initialLen int64) (mkvStart, srcStart, length int64) {
	mkvStart, srcStart, length = mkvOffset, loc.Offset, initialLen
	// Determine how large the source stream is so expansion stays in bounds.
	var limit int64
	switch {
	case m.sourceIndex.UsesESOffsets && int(loc.FileIndex) < len(m.sourceIndex.ESReaders):
		if loc.IsVideo {
			limit = m.sourceIndex.ESReaders[loc.FileIndex].TotalESSize(true)
		} else {
			limit = m.sourceIndex.ESReaders[loc.FileIndex].AudioSubStreamESSize(loc.AudioSubStreamID)
		}
	case !m.sourceIndex.UsesESOffsets && int(loc.FileIndex) < len(m.sourceIndex.Files):
		limit = m.sourceIndex.Files[loc.FileIndex].Size
	}
	// Dispatch to the ES-aware or raw expansion strategy.
	if m.sourceIndex.UsesESOffsets {
		m.expandMatchES(mkvOffset, loc, limit, &mkvStart, &srcStart, &length)
	} else {
		m.expandMatchRaw(mkvOffset, loc, limit, &mkvStart, &srcStart, &length)
	}
	return mkvStart, srcStart, length
}
// expandMatchES expands a match using byte-by-byte ES reads with range hints.
// This is optimized for DVD MPEG-PS sources where ES data is non-contiguous.
// mkvStart/srcStart/length are in/out; each direction is capped at
// MaxExpansionBytes.
func (m *Matcher) expandMatchES(mkvOffset int64, loc source.Location, srcSize int64, mkvStart, srcStart, length *int64) {
	// Walk backward one byte at a time while MKV and source agree.
	hint := -1
	var grown int64
	for *mkvStart > 0 && *srcStart > 0 && grown < MaxExpansionBytes {
		srcByte, nextHint, ok := m.sourceIndex.ReadESByteWithHint(source.Location{
			FileIndex:        loc.FileIndex,
			Offset:           *srcStart - 1,
			IsVideo:          loc.IsVideo,
			AudioSubStreamID: loc.AudioSubStreamID,
		}, hint)
		hint = nextHint
		if !ok || srcByte != m.mkvData[*mkvStart-1] {
			break
		}
		*mkvStart--
		*srcStart--
		*length++
		grown++
	}
	// Walk forward one byte at a time with a fresh range hint.
	hint = -1
	grown = 0
	mkvPos := *mkvStart + *length
	srcPos := *srcStart + *length
	for mkvPos < m.mkvSize && srcPos < srcSize && grown < MaxExpansionBytes {
		srcByte, nextHint, ok := m.sourceIndex.ReadESByteWithHint(source.Location{
			FileIndex:        loc.FileIndex,
			Offset:           srcPos,
			IsVideo:          loc.IsVideo,
			AudioSubStreamID: loc.AudioSubStreamID,
		}, hint)
		hint = nextHint
		if !ok || srcByte != m.mkvData[mkvPos] {
			break
		}
		mkvPos++
		srcPos++
		*length++
		grown++
	}
}
// expandMatchRaw expands a match using chunked reads from raw mmap'd source files.
// Reads 4KB chunks at a time to reduce page faults compared to byte-by-byte access.
//
// mkvStart/srcStart/length are in/out parameters describing the verified
// match; each direction of expansion is capped at MaxExpansionBytes.
// mkvOffset is unused here and kept only for signature parity with
// expandMatchES.
func (m *Matcher) expandMatchRaw(mkvOffset int64, loc source.Location, srcSize int64, mkvStart, srcStart, length *int64) {
	// Expand backward in chunks
	backwardExpanded := int64(0)
	for *mkvStart > 0 && *srcStart > 0 && backwardExpanded < MaxExpansionBytes {
		// Determine chunk size: clamp to the bytes available before the
		// current start in both files and to the remaining expansion budget.
		chunkLen := int64(expandChunkSize)
		if chunkLen > *srcStart {
			chunkLen = *srcStart
		}
		if chunkLen > *mkvStart {
			chunkLen = *mkvStart
		}
		if chunkLen > MaxExpansionBytes-backwardExpanded {
			chunkLen = MaxExpansionBytes - backwardExpanded
		}
		if chunkLen <= 0 {
			break
		}
		srcChunk := m.sourceIndex.RawSlice(source.Location{
			FileIndex: loc.FileIndex,
			Offset:    *srcStart - chunkLen,
		}, int(chunkLen))
		if len(srcChunk) == 0 {
			break
		}
		// Compare backwards through the chunk. RawSlice may return fewer
		// bytes than requested, so anchor on len(srcChunk), not chunkLen.
		mkvChunkStart := *mkvStart - int64(len(srcChunk))
		matched := int64(0)
		for i := len(srcChunk) - 1; i >= 0; i-- {
			if srcChunk[i] != m.mkvData[mkvChunkStart+int64(i)] {
				break
			}
			matched++
		}
		if matched == 0 {
			break
		}
		*mkvStart -= matched
		*srcStart -= matched
		*length += matched
		backwardExpanded += matched
		// A partial chunk match means a mismatching byte ended the run — stop.
		if matched < int64(len(srcChunk)) {
			break
		}
	}
	// Expand forward in chunks
	mkvEnd := *mkvStart + *length
	srcEnd := *srcStart + *length
	forwardExpanded := int64(0)
	for mkvEnd < m.mkvSize && srcEnd < srcSize && forwardExpanded < MaxExpansionBytes {
		// Clamp the chunk to the remaining bytes in both files and the budget.
		chunkLen := int64(expandChunkSize)
		if chunkLen > srcSize-srcEnd {
			chunkLen = srcSize - srcEnd
		}
		if chunkLen > m.mkvSize-mkvEnd {
			chunkLen = m.mkvSize - mkvEnd
		}
		if chunkLen > MaxExpansionBytes-forwardExpanded {
			chunkLen = MaxExpansionBytes - forwardExpanded
		}
		if chunkLen <= 0 {
			break
		}
		srcChunk := m.sourceIndex.RawSlice(source.Location{
			FileIndex: loc.FileIndex,
			Offset:    srcEnd,
		}, int(chunkLen))
		if len(srcChunk) == 0 {
			break
		}
		// Compare forward through the chunk
		matched := int64(0)
		for i := 0; i < len(srcChunk); i++ {
			if srcChunk[i] != m.mkvData[mkvEnd+int64(i)] {
				break
			}
			matched++
		}
		if matched == 0 {
			break
		}
		mkvEnd += matched
		srcEnd += matched
		*length += matched
		forwardExpanded += matched
		// Stop after a partial match — a mismatching byte ended the run.
		if matched < int64(len(srcChunk)) {
			break
		}
	}
}
package matcher
import (
"fmt"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// localityVerifyLen is the minimum number of bytes needed for a reliable
// locality-based match. NALs smaller than this are skipped.
const localityVerifyLen = 64
// alignSearchRange is the number of byte offsets to try in each direction
// when aligning the predicted source offset to the actual NAL header position.
// The prediction can be off by a few bytes due to AVCC vs Annex B framing
// differences (length prefix size vs start code size).
// With range 3, up to 7 candidate offsets (prediction ±3) are probed.
const alignSearchRange = 3
// minAlignBytes is the minimum number of leading bytes that must match
// to confirm alignment between MKV and source NAL data. Set to 4 to avoid
// false positives — H.264 NALs have predictable first 2 bytes (NAL type +
// first slice header byte), so 2 bytes is insufficient across 7 candidates.
const minAlignBytes = 4
// tryLocalityMatch attempts to recover a NAL that failed hash-based matching
// by using the per-track locality hint to predict the source location. This
// recovers NALs that the indexer missed during source indexing — the bytes
// exist in the source but were never hashed into the index.
//
// The approach compares leading bytes at nearby offsets to align the
// prediction, then verifies the full NAL matches byte-for-byte.
//
// Returns a normal matchedRegion, or nil if no match found.
//
// Fix applied: the alignment loop previously declared a source.Location
// named loc, shadowing the packetLocality parameter of the same name; the
// inner variable is renamed candLoc to remove the shadow.
func (m *Matcher) tryLocalityMatch(
	pkt mkv.Packet,
	syncOff int,
	mkvNALData []byte,
	loc packetLocality,
	nalSize int,
) *matchedRegion {
	// Only attempt if we have valid per-packet locality and a large enough NAL
	if !loc.valid || loc.srcEnd <= 0 || loc.mkvEnd <= 0 {
		return nil
	}
	if nalSize < localityVerifyLen || len(mkvNALData) < nalSize {
		return nil
	}
	// Predict approximate source ES offset. Within a single MKV packet,
	// NALs are packed sequentially, so the MKV offset delta closely
	// matches the source ES offset delta (differing only by framing:
	// AVCC length prefix vs Annex B start code, typically ±1 byte).
	// Across packets, MKV offsets include container overhead (cluster/block
	// headers, other tracks' data) that doesn't exist in the source ES,
	// making the prediction unreliable. Skip if the gap is too large.
	currentMkvOff := pkt.Offset + int64(syncOff)
	mkvDelta := currentMkvOff - loc.mkvEnd
	if mkvDelta < 0 || mkvDelta > int64(nalSize)*2 {
		return nil
	}
	predictedSrcOff := loc.srcEnd + mkvDelta
	if predictedSrcOff < 0 {
		return nil
	}
	// Count actual IO-probing attempts (after all early-exit guards)
	m.diagLocalityAttempts.Add(1)
	debugN := m.diagLocalityAttempts.Load()
	debug := m.verboseWriter != nil && debugN <= 10
	hintFileIndex := loc.fileIdx
	if debug {
		fmt.Fprintf(m.verboseWriter, "[locality#%d] mkvOff=%d nalSize=%d nalHdr=%02x predictedSrc=%d fileIdx=%d\n",
			debugN, currentMkvOff, nalSize, mkvNALData[0], predictedSrcOff, hintFileIndex)
	}
	// Try to align the predicted offset to the actual NAL header position
	// by comparing leading bytes. The prediction can be off by a few bytes
	// due to AVCC (4-byte length prefix) vs Annex B (3-4 byte start code)
	// framing differences. We try offsets around the prediction and look for
	// the first position where the initial bytes match the MKV NAL data.
	srcNALOffset := int64(-1)
	for delta := -alignSearchRange; delta <= alignSearchRange; delta++ {
		candidateOff := predictedSrcOff + int64(delta)
		if candidateOff < 0 {
			continue
		}
		candLoc := source.Location{
			FileIndex: hintFileIndex,
			Offset:    candidateOff,
			IsVideo:   true,
		}
		probe, err := m.sourceIndex.ReadESDataAt(candLoc, minAlignBytes)
		if err != nil || len(probe) < minAlignBytes {
			continue
		}
		// Check if the first minAlignBytes bytes match the MKV NAL data
		match := true
		for i := 0; i < minAlignBytes; i++ {
			if probe[i] != mkvNALData[i] {
				match = false
				break
			}
		}
		if match {
			srcNALOffset = candidateOff
			break
		}
	}
	if srcNALOffset < 0 {
		if debug {
			fmt.Fprintf(m.verboseWriter, "[locality#%d] alignment failed\n", debugN)
		}
		return nil
	}
	// Read source NAL data at the aligned offset and verify full match.
	srcLoc := source.Location{
		FileIndex: hintFileIndex,
		Offset:    srcNALOffset,
		IsVideo:   true,
	}
	srcData, err := m.sourceIndex.ReadESDataAt(srcLoc, nalSize)
	if err != nil || len(srcData) < nalSize {
		if debug {
			fmt.Fprintf(m.verboseWriter, "[locality#%d] source read failed: err=%v len=%d need=%d\n", debugN, err, len(srcData), nalSize)
		}
		return nil
	}
	// Verify every byte matches
	for i := 0; i < nalSize; i++ {
		if mkvNALData[i] != srcData[i] {
			if debug {
				fmt.Fprintf(m.verboseWriter, "[locality#%d] mismatch at byte %d: src=%02x mkv=%02x\n", debugN, i, srcData[i], mkvNALData[i])
			}
			return nil
		}
	}
	// Success — exact match found via locality prediction.
	if debug {
		fmt.Fprintf(m.verboseWriter, "[locality#%d] exact match at srcOff=%d\n", debugN, srcNALOffset)
	}
	mkvStart := pkt.Offset + int64(syncOff)
	mkvEnd := mkvStart + int64(nalSize)
	m.diagLocalityMatched.Add(1)
	m.diagLocalityMatchedBytes.Add(int64(nalSize))
	return &matchedRegion{
		mkvStart:  mkvStart,
		mkvEnd:    mkvEnd,
		fileIndex: hintFileIndex,
		srcOffset: srcNALOffset,
		isVideo:   true,
	}
}
package matcher
import (
"fmt"
"sync"
"sync/atomic"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// computeNALSize computes the NAL/sync-unit or frame size from the sync point layout.
// For AVCC, consecutive sync points are separated by the length prefix of the
// next NAL. For Annex B video, sync points correspond to NAL/sync-unit boundaries
// (e.g. slice headers, sequence headers), not necessarily whole decoded frames;
// for audio and subtitles, consecutive sync points typically delimit frame boundaries.
// Returns (nalSize, exact). exact is true only when derived from a known next
// sync point; when false, nalSize is just the remaining data in the (possibly
// truncated) buffer and must not be used for short-circuit decisions.
func computeNALSize(syncPoints []int, i, syncOff, dataLen int, isVideo bool, nalLengthSize int) (int, bool) {
	if i+1 >= len(syncPoints) {
		// Last sync point: only the remaining buffer length is known, and the
		// buffer may be truncated, so the result is inexact.
		return dataLen - syncOff, false
	}
	next := syncPoints[i+1]
	if isVideo && nalLengthSize > 0 {
		// AVCC: the next sync point sits just past the next NAL's length
		// prefix, so subtract the prefix to get this NAL's payload size.
		return next - nalLengthSize - syncOff, true
	}
	return next - syncOff, true
}
// matchParallel processes packets in parallel using a worker pool.
// Progress is reported every 1000 processed packets; the return value is
// the number of packets for which matchPacketParallel succeeded.
func (m *Matcher) matchParallel(packets []mkv.Packet, progress ProgressFunc) int {
	var (
		done atomic.Int64 // packets processed so far
		hits atomic.Int64 // packets that matched
	)
	total := len(packets)
	jobs := make(chan mkv.Packet, m.numWorkers*2)
	var wg sync.WaitGroup
	wg.Add(m.numWorkers)
	for w := 0; w < m.numWorkers; w++ {
		go func() {
			defer wg.Done()
			for pkt := range jobs {
				if m.matchPacketParallel(pkt) {
					hits.Add(1)
				}
				if n := done.Add(1); progress != nil && n%1000 == 0 {
					progress(int(n), total)
				}
			}
		}()
	}
	// Feed the pool, then close to signal completion and wait for drain.
	for _, pkt := range packets {
		jobs <- pkt
	}
	close(jobs)
	wg.Wait()
	return int(hits.Load())
}
// matchPacketParallel is the thread-safe version of matchPacket.
// It scans one packet for codec-appropriate sync points, attempts hash-based
// (and, for video on ES indexes, locality-based) matching at each, records
// verified regions under regionsMu, and maintains the per-track cross-packet
// locality hint. Returns true if at least one region matched or the packet's
// byte range was already fully covered by earlier matches.
func (m *Matcher) matchPacketParallel(pkt mkv.Packet) bool {
	// Determine if this is video or audio
	trackType := m.trackTypes[int(pkt.TrackNum)]
	isVideo := trackType == mkv.TrackTypeVideo
	if isVideo {
		m.diagVideoPacketsTotal.Add(1)
	}
	// Check if this region is already covered by a matched region
	// Note: This is a relaxed check - we may miss some coverage due to race conditions,
	// but that's okay since we merge overlapping regions at the end anyway
	if m.isRangeCoveredParallel(pkt.Offset, pkt.Size) {
		if isVideo {
			m.diagVideoPacketsCoverage.Add(1)
		}
		return true
	}
	// Read packet data to find sync points (zero-copy slice access)
	readSize := pkt.Size
	if readSize < int64(m.windowSize) {
		return false
	}
	// For AVCC/HVCC video, use the full packet data. AVCC parsing is O(num_NALs)
	// not O(packet_size) — it reads 4-byte length fields and jumps, touching only
	// ~20 bytes for a typical frame with 5 NALs. Without this, large frames with
	// multiple slice NALs (common in 1080p Blu-ray) only match the first slice
	// since subsequent slices start past the truncated window.
	// For audio and Annex B video (linear scan), cap at 4096 to avoid waste.
	var useFullPacket bool
	if isVideo {
		codecInfo := m.trackCodecs[int(pkt.TrackNum)]
		if codecInfo.nalLengthSize > 0 {
			useFullPacket = true
		}
	}
	if !useFullPacket && readSize > 4096 {
		readSize = 4096
	}
	// Zero-copy: slice directly into mmap'd data
	endOffset := pkt.Offset + readSize
	if endOffset > m.mkvSize {
		endOffset = m.mkvSize
	}
	data := m.mkvData[pkt.Offset:endOffset]
	if len(data) < m.windowSize {
		return false
	}
	// Find sync points within the packet data
	var syncPoints []int
	codecInfo := m.trackCodecs[int(pkt.TrackNum)]
	if isVideo {
		if codecInfo.nalLengthSize > 0 {
			// AVCC/HVCC format: parse length-prefixed NAL units
			syncPoints = source.FindAVCCNALStarts(data, codecInfo.nalLengthSize)
		} else {
			// Annex B format: find NAL starts after 00 00 01
			syncPoints = source.FindVideoNALStarts(data)
		}
	} else if trackType == mkv.TrackTypeSubtitle {
		syncPoints = source.FindPGSSyncPoints(data)
	} else if m.isPCMTrack[int(pkt.TrackNum)] {
		syncPoints = source.FindLPCMMatchSyncPoints(data)
	} else {
		syncPoints = source.FindAudioSyncPoints(data)
	}
	// Read cross-packet hint once at packet start (mutex-protected, consistent snapshot).
	// NOTE(review): assumes Match() seeded m.trackHints for every track number
	// appearing in packets; a packet with an unknown track would nil-deref
	// here — confirm the demuxer guarantees this.
	crossHint := m.trackHints[pkt.TrackNum]
	var pktLoc packetLocality
	crossHint.mu.Lock()
	if crossHint.valid {
		pktLoc.valid = true
		pktLoc.fileIdx = crossHint.fileIdx
		pktLoc.offset = crossHint.offset
		pktLoc.srcEnd = crossHint.srcEnd
		pktLoc.mkvEnd = crossHint.mkvEnd
	}
	crossHint.mu.Unlock()
	// recordMatch handles bookkeeping for a successful match (hash-based
	// or locality-based): adds the region, marks coverage, updates state.
	// nalType is variadic so non-AVC callers can omit it.
	recordMatch := func(region *matchedRegion, nalSize int, nalType ...byte) {
		matchLen := region.mkvEnd - region.mkvStart
		m.regionsMu.Lock()
		m.matchedRegions = append(m.matchedRegions, *region)
		m.regionsMu.Unlock()
		m.markChunksCovered(region.mkvStart, region.mkvEnd)
		// Update per-packet locality (deterministic, goroutine-local)
		pktLoc.valid = true
		pktLoc.fileIdx = region.fileIndex
		pktLoc.offset = region.srcOffset + matchLen/2
		pktLoc.srcEnd = region.srcOffset + matchLen
		pktLoc.mkvEnd = region.mkvEnd
		if isVideo {
			m.diagVideoNALsMatched.Add(1)
			m.diagVideoNALsMatchedBytes.Add(matchLen)
			m.diagNALSizeMatched[nalSizeBucket(nalSize)].Add(1)
			if len(nalType) > 0 {
				m.diagNALTypeMatched[nalType[0]].Add(1)
			}
		}
	}
	// For AVCC/HVCC video, each NAL unit has different framing bytes than the
	// source (length prefix vs start code), so expansion stops at NAL boundaries.
	// We must match each NAL individually to cover the full packet.
	// For Annex B video (MPEG-2), expansion can cross start code boundaries
	// when the source data matches. However, shared structures like sequence
	// headers match many source locations with short expansions. We must
	// continue trying other sync points (e.g., slice headers) to find better
	// matches that cover the full packet.
	anyMatched := false
	for i, syncOff := range syncPoints {
		if syncOff+m.windowSize > len(data) {
			if isVideo {
				m.diagVideoNALsTooSmall.Add(1)
				m.diagNALSizeUnmatched[0].Add(1) // <64B bucket
			}
			continue
		}
		// Skip sync points whose chunk is already covered — the source data
		// for this region has already been verified byte-for-byte by a prior match.
		if m.isChunkCoveredParallel(pkt.Offset + int64(syncOff)) {
			continue
		}
		if isVideo {
			m.diagVideoNALsTotal.Add(1)
		}
		// Compute NAL/frame size from distance to next sync point.
		nalSize, nalSizeExact := computeNALSize(syncPoints, i, syncOff, len(data), isVideo, codecInfo.nalLengthSize)
		// H.264 NAL type diagnostics (other codecs use different type encodings)
		var nalType byte
		isAVC := isVideo && m.isAVCTrack[int(pkt.TrackNum)] && syncOff < len(data)
		if isAVC {
			// Low 5 bits of the first NAL byte are the H.264 nal_unit_type.
			nalType = data[syncOff] & 0x1F
			m.diagNALTypeTotal[nalType].Add(1)
		}
		// Hash-based matching (all codecs)
		var region *matchedRegion
		if isAVC {
			region = m.tryMatchFromOffsetParallel(pkt, int64(syncOff), data[syncOff:], isVideo, pktLoc, nalSize, nalSizeExact, nalType)
		} else {
			region = m.tryMatchFromOffsetParallel(pkt, int64(syncOff), data[syncOff:], isVideo, pktLoc, nalSize, nalSizeExact)
		}
		// Locality-based recovery for unmatched video NALs (all video codecs)
		if region == nil && isVideo && m.sourceIndex.UsesESOffsets && nalSizeExact {
			region = m.tryLocalityMatch(pkt, syncOff, data[syncOff:], pktLoc, nalSize)
		}
		if region != nil {
			if isAVC {
				recordMatch(region, nalSize, nalType)
			} else {
				recordMatch(region, nalSize)
			}
		} else if isVideo {
			m.diagNALSizeUnmatched[nalSizeBucket(nalSize)].Add(1)
		}
		if region != nil {
			anyMatched = true
			// Once the whole packet is covered there is nothing left to match.
			if m.isRangeCoveredParallel(pkt.Offset, pkt.Size) {
				break
			}
		}
	}
	// For Annex B video, if the first 4096 bytes didn't give full coverage,
	// scan the rest of the packet for additional sync points. This handles
	// cases where only shared structures (sequence headers) appear early
	// but unique slice data further in the packet would match.
	if isVideo && !useFullPacket && !m.isRangeCoveredParallel(pkt.Offset, pkt.Size) && pkt.Size > 4096 {
		fullEnd := pkt.Offset + pkt.Size
		if fullEnd > m.mkvSize {
			fullEnd = m.mkvSize
		}
		fullData := m.mkvData[pkt.Offset:fullEnd]
		moreSyncPoints := source.FindVideoNALStarts(fullData)
		for moreIdx, syncOff := range moreSyncPoints {
			if syncOff < int(readSize) {
				continue // Already tried in the first pass
			}
			if syncOff+m.windowSize > len(fullData) {
				continue
			}
			if m.isChunkCoveredParallel(pkt.Offset + int64(syncOff)) {
				continue
			}
			moreNALSize, moreNALSizeExact := computeNALSize(moreSyncPoints, moreIdx, syncOff, len(fullData), isVideo, codecInfo.nalLengthSize)
			region := m.tryMatchFromOffsetParallel(pkt, int64(syncOff), fullData[syncOff:], isVideo, pktLoc, moreNALSize, moreNALSizeExact)
			if region != nil {
				recordMatch(region, moreNALSize)
				anyMatched = true
				if m.isRangeCoveredParallel(pkt.Offset, pkt.Size) {
					break
				}
			}
		}
	}
	// Also try from packet start (in case it's already aligned)
	if !anyMatched {
		region := m.tryMatchFromOffsetParallel(pkt, 0, data, isVideo, pktLoc, len(data), false)
		if region != nil {
			recordMatch(region, len(data))
			anyMatched = true
		}
	}
	// Write back cross-packet hint (mutex-protected, consistent snapshot)
	if pktLoc.valid {
		crossHint.mu.Lock()
		crossHint.valid = true
		crossHint.fileIdx = pktLoc.fileIdx
		crossHint.offset = pktLoc.offset
		crossHint.srcEnd = pktLoc.srcEnd
		crossHint.mkvEnd = pktLoc.mkvEnd
		crossHint.mu.Unlock()
	}
	return anyMatched
}
// tryMatchFromOffsetParallel attempts hash-based matching for a NAL at the given
// offset. Returns the matched region or nil. The caller handles bookkeeping
// (adding to matchedRegions, marking coverage, updating locality state).
//
// Uses two-phase locality-aware matching:
//   - Phase 1: If packet locality exists, try the closest hash locations first.
//   - Phase 2: Fall back to trying all remaining locations.
//
// nalSize/nalSizeExact describe the NAL starting at data[0] and drive the
// "good enough, stop searching" early exits. nalType optionally carries the
// NAL unit type byte, used only for diagnostics. Safe to call from worker
// goroutines: the source index is read-only, diag counters use Add (atomic),
// and the shared example buffer is guarded by diagExamplesMu.
func (m *Matcher) tryMatchFromOffsetParallel(pkt mkv.Packet, offsetInPacket int64, data []byte, isVideo bool, loc packetLocality, nalSize int, nalSizeExact bool, nalType ...byte) *matchedRegion {
	// Not enough bytes remaining to form a full hash window.
	if len(data) < m.windowSize {
		return nil
	}
	m.diagTotalSyncPoints.Add(1)
	window := data[:m.windowSize]
	hash := xxhash.Sum64(window)
	// Look up in source index (read-only, thread-safe)
	locations := m.sourceIndex.Lookup(hash)
	if len(locations) == 0 {
		// Hash miss: record diagnostics (video only) and bail out.
		if isVideo {
			m.diagVideoNALsHashNotFound.Add(1)
			if len(nalType) > 0 {
				m.diagNALTypeNotFound[nalType[0]].Add(1)
			}
			// Capture first 20 examples
			if len(nalType) > 0 {
				m.diagExamplesMu.Lock()
				if m.diagExamplesCount < 20 {
					m.diagExamplesCount++
					example := fmt.Sprintf(" NAL type=%d, pktOff=%d, syncOff=%d, nalSize=%d, hash=%016x, first8bytes=%02x",
						nalType[0], pkt.Offset, offsetInPacket, nalSize, hash, data[:min(8, len(data))])
					m.diagExamplesOutput = append(m.diagExamplesOutput, example)
				}
				m.diagExamplesMu.Unlock()
			}
		}
		return nil
	}
	var bestMatch *matchedRegion
	bestMatchLen := int64(0)
	triedVerify := false // whether any tryVerifyAndExpand was called
	// Track which location indices were tried in Phase 1 (small fixed-size array)
	var triedIndices [localityNearbyCount]int
	triedCount := 0
	// Phase 1: Locality-aware search — try nearby locations first (per-packet locality)
	if loc.valid && len(locations) > 1 {
		nearby := nearbyLocationIndices(locations, loc.fileIdx, loc.offset, localityNearbyCount)
		for _, idx := range nearby {
			triedIndices[triedCount] = idx
			triedCount++
			l := locations[idx]
			// ES-offset indexes tag each location video/audio; only same-kind
			// locations can match this NAL.
			if m.sourceIndex.UsesESOffsets && l.IsVideo != isVideo {
				if isVideo {
					m.diagVideoNALsSkippedIsVideo.Add(1)
				}
				continue
			}
			triedVerify = true
			region := m.tryVerifyAndExpand(pkt, l, offsetInPacket, isVideo)
			if region != nil {
				matchLen := region.mkvEnd - region.mkvStart
				if matchLen > bestMatchLen {
					bestMatch = region
					bestMatchLen = matchLen
				}
				// Early exit when the match is good enough: clears the
				// locality threshold, or spans the whole (exact-size) NAL.
				if bestMatchLen >= localityGoodMatchThreshold || (nalSizeExact && nalSize >= m.windowSize && bestMatchLen >= int64(nalSize)) {
					break
				}
			}
		}
	}
	// Phase 2: Full search of remaining locations
	phase2Skipped := bestMatchLen >= localityGoodMatchThreshold || (nalSizeExact && nalSize >= m.windowSize && bestMatchLen >= int64(nalSize))
	if phase2Skipped {
		m.diagPhase1Skips.Add(1)
	}
	if !phase2Skipped {
		m.diagPhase2Fallbacks.Add(1)
		verifyAttempts := 0
		for i, l := range locations {
			// Skip anything already tried in Phase 1 (linear scan is fine:
			// triedCount is bounded by localityNearbyCount).
			alreadyTried := false
			for t := 0; t < triedCount; t++ {
				if triedIndices[t] == i {
					alreadyTried = true
					break
				}
			}
			if alreadyTried {
				continue
			}
			if m.sourceIndex.UsesESOffsets && l.IsVideo != isVideo {
				if isVideo {
					m.diagVideoNALsSkippedIsVideo.Add(1)
				}
				continue
			}
			triedVerify = true
			verifyAttempts++
			m.diagPhase2Locations.Add(1)
			region := m.tryVerifyAndExpand(pkt, l, offsetInPacket, isVideo)
			if region != nil {
				matchLen := region.mkvEnd - region.mkvStart
				if matchLen > bestMatchLen {
					bestMatch = region
					bestMatchLen = matchLen
				}
				// Same early-exit criterion as Phase 1.
				if bestMatchLen >= localityGoodMatchThreshold || (nalSizeExact && nalSize >= m.windowSize && bestMatchLen >= int64(nalSize)) {
					m.diagPhase2EarlyExits.Add(1)
					break
				}
			}
			// Cap verify attempts per sync point so pathological hash
			// collisions cannot stall a worker.
			if verifyAttempts >= phase2MaxVerifyAttempts {
				m.diagPhase2Capped.Add(1)
				break
			}
		}
	}
	if bestMatch != nil {
		return bestMatch
	}
	// No match: record why (verification failed vs. all candidates skipped).
	if isVideo {
		if triedVerify {
			m.diagVideoNALsVerifyFailed.Add(1)
		} else {
			m.diagVideoNALsAllSkipped.Add(1)
		}
	}
	return nil
}
// nearbyLocationIndices returns up to maxCount indices into locations that are
// closest to hintOffset within the file identified by hintFileIndex. The
// locations slice must be pre-sorted by (FileIndex, Offset) via
// SortLocationsByOffset. The result is empty when the target file has no
// locations.
func nearbyLocationIndices(locations []source.Location, hintFileIndex uint16, hintOffset int64, maxCount int) []int {
	total := len(locations)
	if total == 0 {
		return nil
	}
	// Binary search for the first entry >= (hintFileIndex, hintOffset).
	low, high := 0, total
	for low < high {
		mid := low + (high-low)/2
		cand := locations[mid]
		isBefore := cand.FileIndex < hintFileIndex ||
			(cand.FileIndex == hintFileIndex && cand.Offset < hintOffset)
		if isBefore {
			low = mid + 1
		} else {
			high = mid
		}
	}
	absDist := func(d int64) int64 {
		if d < 0 {
			return -d
		}
		return d
	}
	// Two-pointer sweep outward from the insertion point, always taking the
	// side whose offset is nearer to the hint. Only entries belonging to the
	// target file qualify; ties go to the left (lower-offset) side.
	picked := make([]int, 0, maxCount)
	lower, upper := low-1, low
	for len(picked) < maxCount {
		haveLower := lower >= 0 && locations[lower].FileIndex == hintFileIndex
		haveUpper := upper < total && locations[upper].FileIndex == hintFileIndex
		switch {
		case haveLower && haveUpper:
			if absDist(hintOffset-locations[lower].Offset) <= absDist(locations[upper].Offset-hintOffset) {
				picked = append(picked, lower)
				lower--
			} else {
				picked = append(picked, upper)
				upper++
			}
		case haveLower:
			picked = append(picked, lower)
			lower--
		case haveUpper:
			picked = append(picked, upper)
			upper++
		default:
			// No candidates left in the target file.
			return picked
		}
	}
	return picked
}
// isRangeCoveredParallel reports whether every coverage chunk overlapped by
// [offset, offset+size) is marked in the coverage bitmap. The check works at
// chunk granularity, so it may report false positives when different regions
// cover adjacent chunks — acceptable because overlapping regions are merged
// at the end anyway.
func (m *Matcher) isRangeCoveredParallel(offset, size int64) bool {
	first := offset / coverageChunkSize
	last := (offset + size - 1) / coverageChunkSize
	m.coverageMu.RLock()
	defer m.coverageMu.RUnlock()
	// Every chunk in the range must have its bit set.
	for c := first; c <= last; c++ {
		word := c / 64
		bit := uint(c % 64)
		if word >= int64(len(m.coveredChunks)) {
			return false
		}
		if m.coveredChunks[word]&(1<<bit) == 0 {
			return false
		}
	}
	return true
}
// isChunkCoveredParallel reports whether the coverage chunk containing
// absOffset is already marked. Used to skip sync points that land inside
// already-matched regions, avoiding redundant hash lookups and source reads.
func (m *Matcher) isChunkCoveredParallel(absOffset int64) bool {
	c := absOffset / coverageChunkSize
	word := c / 64
	bit := uint(c % 64)
	m.coverageMu.RLock()
	covered := word < int64(len(m.coveredChunks)) && m.coveredChunks[word]&(1<<bit) != 0
	m.coverageMu.RUnlock()
	return covered
}
// markChunksCovered sets the coverage bits for every chunk that lies entirely
// inside [start, end). Partially-covered chunks at the edges are left
// unmarked so they can still attract future matches.
func (m *Matcher) markChunksCovered(start, end int64) {
	// Round start up and end down to whole-chunk boundaries.
	first := (start + coverageChunkSize - 1) / coverageChunkSize
	last := end/coverageChunkSize - 1
	if first > last {
		// The region does not fully contain any chunk.
		return
	}
	m.coverageMu.Lock()
	defer m.coverageMu.Unlock()
	words := int64(len(m.coveredChunks))
	for c := first; c <= last; c++ {
		if word := c / 64; word < words {
			m.coveredChunks[word] |= 1 << uint(c%64)
		}
	}
}
package matcher
import (
"bufio"
"fmt"
"os"
)
// Entry represents a region in the MKV file and where its data comes from.
// A slice of Entries covers the whole MKV byte range (see Result.Entries);
// each one says either "these bytes live in the delta" (Source == 0) or
// "these bytes can be read back out of a source file".
type Entry struct {
	MkvOffset        int64  // Start offset in the MKV file
	Length           int64  // Length of this region
	Source           uint16 // 0 = delta, 1+ = source file index + 1 (supports up to 65535 files)
	SourceOffset     int64  // Offset in source file (or ES offset for ES-based sources)
	IsVideo          bool   // For ES-based sources: whether this is video or audio data
	AudioSubStreamID byte   // For ES-based audio: sub-stream ID (0x80-0x87=AC3, etc.)
	IsLPCM           bool   // True if this is 16-bit LPCM audio requiring byte-swap on read
}
// Result contains the results of the matching process. Delta bytes live in
// exactly one backing store: DeltaData (in-memory) or DeltaFile (temp-file);
// use DeltaSize to query the size regardless of which is populated.
type Result struct {
	Entries        []Entry      // All entries covering the entire MKV file
	DeltaData      []byte       // Concatenated unique data (for small deltas / tests)
	DeltaFile      *DeltaWriter // File-backed delta data (for large files)
	MatchedBytes   int64        // Total bytes matched to source
	UnmatchedBytes int64        // Total bytes in delta
	MatchedPackets int          // Number of packets that matched
	TotalPackets   int          // Total number of packets processed
}
// DeltaSize returns the total number of delta bytes, whether they are held
// in memory (DeltaData) or spilled to a temp file (DeltaFile).
func (r *Result) DeltaSize() int64 {
	if r.DeltaFile == nil {
		return int64(len(r.DeltaData))
	}
	return r.DeltaFile.Size()
}
// Close releases resources held by the result, deleting any temp-file-backed
// delta data. Safe to call more than once.
func (r *Result) Close() {
	if dw := r.DeltaFile; dw != nil {
		r.DeltaFile = nil
		dw.Close()
	}
}
// DeltaWriter accumulates delta data in a temp file so large deltas do not
// pile up on the heap.
type DeltaWriter struct {
	file     *os.File      // backing temp file (nil once Close has run)
	buffered *bufio.Writer // buffers small writes before they hit disk
	size     int64         // running count of bytes accepted by Write
}

// NewDeltaWriter creates a DeltaWriter backed by a fresh temp file.
func NewDeltaWriter() (*DeltaWriter, error) {
	tmp, err := os.CreateTemp("", "mkvdup-delta-*")
	if err != nil {
		return nil, fmt.Errorf("create delta temp file: %w", err)
	}
	dw := &DeltaWriter{
		file:     tmp,
		buffered: bufio.NewWriterSize(tmp, 256*1024),
	}
	return dw, nil
}

// Write appends data to the delta file, tracking how many bytes were accepted.
func (dw *DeltaWriter) Write(data []byte) error {
	n, err := dw.buffered.Write(data)
	dw.size += int64(n)
	return err
}

// Flush forces all buffered data out to the temp file.
func (dw *DeltaWriter) Flush() error { return dw.buffered.Flush() }

// Size returns the total number of bytes written so far.
func (dw *DeltaWriter) Size() int64 { return dw.size }

// File exposes the backing temp file for reading; callers must Flush first.
func (dw *DeltaWriter) File() *os.File { return dw.file }

// Close deletes the backing temp file. Safe to call more than once.
func (dw *DeltaWriter) Close() {
	if dw.file == nil {
		return
	}
	path := dw.file.Name()
	dw.file.Close()
	os.Remove(path)
	dw.file = nil
}
package matcher
import (
"fmt"
"sort"
"github.com/stuckj/mkvdup/internal/mkv"
"github.com/stuckj/mkvdup/internal/source"
)
// fillTrueHDGaps fills unmatched gaps in TrueHD tracks by comparing MKV data
// with source ES data between existing matched regions.
//
// MKV A_TRUEHD packets contain pure TrueHD data (AC3 stripped by the muxer).
// The source parser independently strips AC3 from the Blu-ray interleaved
// stream, but may split at slightly different boundaries. This creates small
// "extra" byte regions in the MKV that aren't in the source ES, breaking
// expansion chains from sync-point matches and leaving large spans unmatched.
//
// Packets are bucketed per TrueHD track, sorted by MKV offset, and each
// track's gaps are filled independently via fillTrueHDTrackGaps.
func (m *Matcher) fillTrueHDGaps(packets []mkv.Packet) {
	byTrack := make(map[int][]mkv.Packet)
	for _, p := range packets {
		if tn := int(p.TrackNum); m.isTrueHDTrack[tn] {
			byTrack[tn] = append(byTrack[tn], p)
		}
	}
	for tn, list := range byTrack {
		sort.Slice(list, func(a, b int) bool {
			return list[a].Offset < list[b].Offset
		})
		m.fillTrueHDTrackGaps(tn, list)
	}
}
// fillTrueHDTrackGaps fills gaps for a single TrueHD track by finding
// existing matched regions on this track and filling the gaps between them.
// pkts must contain only this track's packets, sorted by Offset (the caller
// guarantees both).
func (m *Matcher) fillTrueHDTrackGaps(trackNum int, pkts []mkv.Packet) {
	if len(pkts) == 0 {
		return
	}
	// Binary search to find which packet contains a given MKV offset.
	// Returns -1 when the offset falls between packets (not TrueHD data).
	findPacketIdx := func(mkvOffset int64) int {
		lo, hi := 0, len(pkts)
		for lo < hi {
			mid := lo + (hi-lo)/2
			if pkts[mid].Offset+pkts[mid].Size <= mkvOffset {
				lo = mid + 1
			} else {
				hi = mid
			}
		}
		if lo < len(pkts) && mkvOffset >= pkts[lo].Offset && mkvOffset < pkts[lo].Offset+pkts[lo].Size {
			return lo
		}
		return -1
	}
	// findFirstPacketAt returns the index of the first packet whose end is > mkvOffset.
	findFirstPacketAt := func(mkvOffset int64) int {
		lo, hi := 0, len(pkts)
		for lo < hi {
			mid := lo + (hi-lo)/2
			if pkts[mid].Offset+pkts[mid].Size <= mkvOffset {
				lo = mid + 1
			} else {
				hi = mid
			}
		}
		return lo
	}
	// Collect matched regions that fall within this track's packets.
	// NOTE(review): takes the full Lock for a read-only scan — presumably
	// regionsMu is a plain Mutex; confirm before changing to RLock.
	m.regionsMu.Lock()
	var trackRegions []matchedRegion
	for _, r := range m.matchedRegions {
		if findPacketIdx(r.mkvStart) >= 0 {
			trackRegions = append(trackRegions, r)
		}
	}
	m.regionsMu.Unlock()
	// Gap-filling needs at least two anchors to define a gap between them.
	if len(trackRegions) < 2 {
		if m.verboseWriter != nil {
			fmt.Fprintf(m.verboseWriter, "\n[TrueHD gap-fill] track %d: only %d matched regions, need ≥2 for gap-fill\n",
				trackNum, len(trackRegions))
		}
		return
	}
	// Sort by mkvStart for sequential gap processing.
	sort.Slice(trackRegions, func(i, j int) bool {
		return trackRegions[i].mkvStart < trackRegions[j].mkvStart
	})
	if m.verboseWriter != nil {
		fmt.Fprintf(m.verboseWriter, "\n[TrueHD gap-fill] track %d: %d matched regions, fileIndex=%d, subStreamID=0x%02X\n",
			trackNum, len(trackRegions), trackRegions[0].fileIndex, trackRegions[0].audioSubStreamID)
	}
	// Fill gaps between adjacent matched regions.
	var newRegions []matchedRegion
	var totalFilledBytes, totalGapBytes, gapsFilled, gapsSkipped int64
	for i := 0; i < len(trackRegions)-1; i++ {
		prev := trackRegions[i]
		next := trackRegions[i+1]
		// Verify both regions use the same source
		if prev.fileIndex != next.fileIndex || prev.audioSubStreamID != next.audioSubStreamID {
			continue
		}
		// MKV gap: the unmatched bytes between the two anchor regions.
		gapMKVStart := prev.mkvEnd
		gapMKVEnd := next.mkvStart
		if gapMKVEnd <= gapMKVStart {
			continue
		}
		// Source ES gap: from end of prev's source range to start of next's source range
		prevSrcEnd := prev.srcOffset + (prev.mkvEnd - prev.mkvStart)
		nextSrcStart := next.srcOffset
		srcGapSize := nextSrcStart - prevSrcEnd
		// srcGapSize <= 0 means overlapping or backwards source offsets (invalid gap);
		// srcGapSize < 16 means too small to produce a meaningful match run.
		if srcGapSize <= 0 || srcGapSize < 16 {
			gapsSkipped++
			continue
		}
		// Collect TrueHD packet segments within the gap.
		// Only compare bytes within actual TrueHD packets, skipping
		// interleaved video/audio data from other tracks.
		startPkt := findFirstPacketAt(gapMKVStart)
		var segments []mkvSegment
		for p := startPkt; p < len(pkts) && pkts[p].Offset < gapMKVEnd; p++ {
			pkt := pkts[p]
			// Clip each packet to the gap (and to the file size).
			segStart := max(pkt.Offset, gapMKVStart)
			segEnd := min(pkt.Offset+pkt.Size, gapMKVEnd)
			if segEnd > m.mkvSize {
				segEnd = m.mkvSize
			}
			if segStart < segEnd {
				segments = append(segments, mkvSegment{segStart, segEnd})
				totalGapBytes += segEnd - segStart
			}
		}
		if len(segments) == 0 {
			gapsSkipped++
			continue
		}
		regions := m.fillTrueHDGapSegments(segments, prevSrcEnd, srcGapSize, prev.fileIndex, prev.audioSubStreamID)
		if len(regions) > 0 {
			newRegions = append(newRegions, regions...)
			gapsFilled++
			for _, r := range regions {
				totalFilledBytes += r.mkvEnd - r.mkvStart
			}
		}
	}
	// Add all new regions
	if len(newRegions) > 0 {
		m.regionsMu.Lock()
		m.matchedRegions = append(m.matchedRegions, newRegions...)
		m.regionsMu.Unlock()
		// Coverage marking happens outside regionsMu; markChunksCovered
		// takes its own coverage lock.
		for i := range newRegions {
			m.markChunksCovered(newRegions[i].mkvStart, newRegions[i].mkvEnd)
		}
	}
	if m.verboseWriter != nil {
		fmt.Fprintf(m.verboseWriter, "[TrueHD gap-fill] track %d: filled %d gaps (%d bytes, %.2f MB), %d gaps skipped, total TrueHD gap bytes=%d (%.2f MB)\n",
			trackNum, gapsFilled, totalFilledBytes, float64(totalFilledBytes)/(1024*1024),
			gapsSkipped, totalGapBytes, float64(totalGapBytes)/(1024*1024))
	}
}
// mkvSegment describes a contiguous range of MKV data to compare.
// start is inclusive, end exclusive, both absolute MKV file offsets.
type mkvSegment struct{ start, end int64 }
// fillTrueHDGapSegments fills a gap between two matched regions using greedy
// forward comparison across multiple MKV segments (TrueHD packet portions).
//
// The MKV may contain extra bytes (from AC3 splitting differences) that aren't
// in the source ES. When a mismatch occurs, the algorithm advances the MKV
// position by one byte while keeping the source position fixed, then retries.
// Matching runs of ≥16 bytes are recorded as new matched regions.
//
// srcStart/srcSize define the source ES byte range to compare against;
// fileIndex and subStreamID identify the source stream and are copied into
// every produced region.
func (m *Matcher) fillTrueHDGapSegments(
	segments []mkvSegment,
	srcStart, srcSize int64,
	fileIndex uint16, subStreamID byte,
) []matchedRegion {
	if srcSize <= 0 {
		return nil
	}
	// Read the source ES bytes covering the gap.
	loc := source.Location{
		FileIndex:        fileIndex,
		Offset:           srcStart,
		IsVideo:          false,
		AudioSubStreamID: subStreamID,
	}
	srcData, err := m.sourceIndex.ReadESDataAt(loc, int(srcSize))
	if err != nil || len(srcData) == 0 {
		return nil
	}
	const minRunLen = 16
	var regions []matchedRegion
	srcIdx := 0
	runMKVStart := int64(-1) // MKV offset where the current matching run began (-1 = no run)
	runSrcStart := -1        // source index where the current run began
	// flushRun records the pending run as [runMKVStart, runEnd) if it is long
	// enough, then clears the run state.
	flushRun := func(runEnd int64) {
		if runMKVStart < 0 {
			return
		}
		if runEnd-runMKVStart >= minRunLen {
			regions = append(regions, matchedRegion{
				mkvStart:         runMKVStart,
				mkvEnd:           runEnd,
				fileIndex:        fileIndex,
				srcOffset:        srcStart + int64(runSrcStart),
				isVideo:          false,
				audioSubStreamID: subStreamID,
			})
		}
		runMKVStart = -1
		runSrcStart = -1
	}
	// Walk each MKV segment (TrueHD packet data only).
	for _, seg := range segments {
		if srcIdx >= len(srcData) {
			break
		}
		if seg.end > m.mkvSize {
			continue
		}
		mkvData := m.mkvData[seg.start:seg.end]
		mkvOff := 0
		for mkvOff < len(mkvData) && srcIdx < len(srcData) {
			mkvAbsPos := seg.start + int64(mkvOff)
			if mkvData[mkvOff] == srcData[srcIdx] {
				if runMKVStart < 0 {
					runMKVStart = mkvAbsPos
					runSrcStart = srcIdx
				}
				mkvOff++
				srcIdx++
			} else {
				// Mismatch: flush any pending run, then skip one MKV byte
				// (an extra byte not present in the source ES) while keeping
				// the source position fixed.
				flushRun(mkvAbsPos)
				mkvOff++
			}
		}
		// End of segment (next segment starts at a different MKV offset) or
		// end of source data: flush the pending run at the last byte actually
		// compared. FIX: flushing at seg.end unconditionally (as before) was
		// wrong when the inner loop stopped early because srcData ran out —
		// it recorded MKV bytes that were never compared as matched.
		// seg.start+mkvOff equals seg.end whenever the whole segment was
		// consumed, so this is a strict tightening.
		flushRun(seg.start + int64(mkvOff))
	}
	return regions
}
// Package mkv provides functionality for parsing MKV (Matroska) files.
package mkv
import (
"encoding/binary"
"errors"
"fmt"
"io"
)
// EBML Element IDs (Matroska specification). Values include the VINT marker
// bits, matching what readVINT returns with keepMarker=true.
const (
	// EBML Header elements
	IDEBMLHeader      = 0x1A45DFA3
	IDEBMLVersion     = 0x4286
	IDEBMLReadVersion = 0x42F7
	IDEBMLMaxIDLength = 0x42F2
	IDEBMLMaxSizeLength = 0x42F3
	IDDocType        = 0x4282
	IDDocTypeVersion = 0x4287
	IDDocTypeReadVer = 0x4285
	// Segment and top-level elements
	IDSegment  = 0x18538067
	IDSeekHead = 0x114D9B74
	IDInfo     = 0x1549A966
	IDTracks   = 0x1654AE6B
	IDChapters = 0x1043A770
	IDCluster  = 0x1F43B675
	IDCues     = 0x1C53BB6B
	IDTags     = 0x1254C367
	// Cluster elements
	IDTimestamp   = 0xE7
	IDSimpleBlock = 0xA3
	IDBlockGroup  = 0xA0
	IDBlock       = 0xA1
	// Track elements
	IDTrackEntry   = 0xAE
	IDTrackNum     = 0xD7
	IDTrackUID     = 0x73C5
	IDTrackType    = 0x83
	IDCodecID      = 0x86
	IDCodecPrivate = 0x63A2
)
// Track types (values of the Matroska TrackType element).
const (
	TrackTypeVideo    = 1
	TrackTypeAudio    = 2
	TrackTypeComplex  = 3
	TrackTypeLogo     = 0x10
	TrackTypeSubtitle = 0x11
	TrackTypeButtons  = 0x12
	TrackTypeControl  = 0x20
)
// ErrInvalidEBML is returned when EBML parsing fails.
var ErrInvalidEBML = errors.New("invalid EBML data")

// Element represents a parsed EBML element header.
type Element struct {
	ID         uint64 // Element ID (variable length, marker bits preserved)
	Size       int64  // Element data size (-1 when the size is unknown)
	DataOffset int64  // Offset of the element's data in the file
	HeaderSize int    // Bytes consumed by the ID + size encoding
}

// ReadElementHeader reads an EBML element header (ID and size) from r, which
// must be positioned at file offset `offset`, and returns the decoded element.
func ReadElementHeader(r io.Reader, offset int64) (Element, error) {
	elem := Element{DataOffset: offset}
	// Element ID: a VINT whose marker bits are kept (EBML IDs include them).
	id, idLen, err := readVINT(r, true)
	if err != nil {
		return elem, fmt.Errorf("read element ID: %w", err)
	}
	elem.ID = id
	elem.HeaderSize = idLen
	// Element size: a VINT with the marker stripped.
	size, sizeLen, err := readVINT(r, false)
	if err != nil {
		return elem, fmt.Errorf("read element size: %w", err)
	}
	elem.HeaderSize += sizeLen
	if isUnknownSize(size, sizeLen) {
		// All data bits set is the reserved "unknown size" value.
		elem.Size = -1
	} else {
		elem.Size = int64(size)
	}
	elem.DataOffset = offset + int64(elem.HeaderSize)
	return elem, nil
}

// readVINT decodes an EBML variable-length integer. When keepMarker is true
// the length-marker bit is retained in the returned value (used for element
// IDs). Returns the value, the number of bytes consumed, and any read error.
func readVINT(r io.Reader, keepMarker bool) (uint64, int, error) {
	var lead [1]byte
	if _, err := io.ReadFull(r, lead[:]); err != nil {
		return 0, 0, err
	}
	first := lead[0]
	if first == 0 {
		// A zero first byte would encode a length beyond 8 bytes.
		return 0, 0, ErrInvalidEBML
	}
	// The position of the highest set bit gives the total length:
	// 1xxxxxxx = 1 byte, 01xxxxxx = 2 bytes, ..., 00000001 = 8 bytes.
	length := 1
	marker := byte(0x80)
	for first&marker == 0 {
		marker >>= 1
		length++
	}
	var value uint64
	if keepMarker {
		value = uint64(first)
	} else {
		value = uint64(first & (marker - 1))
	}
	if length > 1 {
		rest := make([]byte, length-1)
		if _, err := io.ReadFull(r, rest); err != nil {
			return 0, 0, err
		}
		for _, tail := range rest {
			value = value<<8 | uint64(tail)
		}
	}
	return value, length, nil
}

// isUnknownSize reports whether a decoded size VINT holds the reserved
// "unknown size" value: all data bits set (0x7F for 1 byte, 0x3FFF for 2,
// and so on — each extra byte adds 7 data bits).
func isUnknownSize(value uint64, length int) bool {
	if length < 1 || length > 8 {
		return false
	}
	allOnes := uint64(1)<<(7*uint(length)) - 1
	return value == allOnes
}
// ReadUint reads a big-endian unsigned integer element value of the given
// byte size (0-8). A size of 0 yields 0 without reading.
func ReadUint(r io.Reader, size int64) (uint64, error) {
	switch {
	case size < 0 || size > 8:
		return 0, fmt.Errorf("invalid uint size: %d", size)
	case size == 0:
		return 0, nil
	}
	raw := make([]byte, size)
	if _, err := io.ReadFull(r, raw); err != nil {
		return 0, err
	}
	var v uint64
	for _, b := range raw {
		v = v<<8 | uint64(b)
	}
	return v, nil
}

// ReadInt reads a big-endian signed integer element value of the given byte
// size, sign-extending the result to 64 bits.
func ReadInt(r io.Reader, size int64) (int64, error) {
	u, err := ReadUint(r, size)
	if err != nil {
		return 0, err
	}
	if size == 0 {
		return 0, nil
	}
	// Sign-extend when the top bit of the encoded value is set. For size 8
	// the shift count is 64, which Go defines to produce 0 — a no-op, as
	// the value already fills all 64 bits.
	if u>>(uint(size)*8-1) != 0 {
		u |= ^uint64(0) << (uint(size) * 8)
	}
	return int64(u), nil
}
// ReadString reads a string element value of the given size, trimming any
// trailing NUL padding bytes.
func ReadString(r io.Reader, size int64) (string, error) {
	if size < 0 {
		return "", fmt.Errorf("invalid string size: %d", size)
	}
	if size == 0 {
		return "", nil
	}
	raw := make([]byte, size)
	if _, err := io.ReadFull(r, raw); err != nil {
		return "", err
	}
	// Drop trailing NULs (EBML strings may be zero-padded).
	end := len(raw)
	for end > 0 && raw[end-1] == 0 {
		end--
	}
	return string(raw[:end]), nil
}
// ReadBinary reads exactly size bytes of opaque element data.
// A size of 0 returns a nil slice and no error.
func ReadBinary(r io.Reader, size int64) ([]byte, error) {
	switch {
	case size < 0:
		return nil, fmt.Errorf("invalid binary size: %d", size)
	case size == 0:
		return nil, nil
	}
	out := make([]byte, size)
	if _, err := io.ReadFull(r, out); err != nil {
		return nil, err
	}
	return out, nil
}
// SimpleBlockHeader contains the decoded fixed header of a SimpleBlock
// (a Block shares the same header layout).
type SimpleBlockHeader struct {
	TrackNumber uint64 // Track the block belongs to
	Timestamp   int16  // Signed offset relative to the cluster timestamp
	Flags       byte   // Keyframe, invisible, lacing, discardable bits
	HeaderSize  int    // Total header length in bytes
}

// Block flag bits.
const (
	FlagKeyframe    = 0x80
	FlagInvisible   = 0x08
	FlagLacing      = 0x06 // Mask for lacing type
	FlagDiscardable = 0x01
)

// Lacing types (values of Flags & FlagLacing).
const (
	LacingNone  = 0x00
	LacingXiph  = 0x02
	LacingFixed = 0x04
	LacingEBML  = 0x06
)

// ParseSimpleBlockHeader decodes the header of a SimpleBlock element. data
// must begin at the element's payload (just past the EBML ID and size).
func ParseSimpleBlockHeader(data []byte) (SimpleBlockHeader, error) {
	if len(data) < 4 {
		return SimpleBlockHeader{}, fmt.Errorf("SimpleBlock too short: %d bytes", len(data))
	}
	// Track number is a VINT with the marker stripped.
	trackNum, trackLen := parseVINTFromBytes(data)
	pos := trackLen
	// Need 2 timestamp bytes + 1 flags byte after the track number.
	if pos+3 > len(data) {
		return SimpleBlockHeader{}, fmt.Errorf("SimpleBlock header truncated")
	}
	hdr := SimpleBlockHeader{
		TrackNumber: trackNum,
		Timestamp:   int16(binary.BigEndian.Uint16(data[pos:])),
		Flags:       data[pos+2],
		HeaderSize:  pos + 3,
	}
	return hdr, nil
}

// parseVINTFromBytes decodes a VINT (marker stripped) from the start of data.
// Returns (0, 0) when data does not hold a valid 1-4 byte VINT.
func parseVINTFromBytes(data []byte) (uint64, int) {
	if len(data) == 0 || data[0] == 0 {
		return 0, 0
	}
	lead := data[0]
	length := 1
	marker := byte(0x80)
	for lead&marker == 0 {
		marker >>= 1
		length++
		if length > 4 {
			// Track numbers longer than 4 bytes are not supported here.
			return 0, 0
		}
	}
	if len(data) < length {
		return 0, 0
	}
	value := uint64(lead & (marker - 1))
	for _, b := range data[1:length] {
		value = value<<8 | uint64(b)
	}
	return value, length
}

// IsKeyframe reports whether the keyframe flag bit is set.
func (h SimpleBlockHeader) IsKeyframe() bool {
	return h.Flags&FlagKeyframe != 0
}

// LacingType returns the block's lacing mode (one of the Lacing* constants).
func (h SimpleBlockHeader) LacingType() byte {
	return h.Flags & FlagLacing
}
package mkv
import (
"bytes"
"fmt"
"io"
"os"
"github.com/stuckj/mkvdup/internal/mmap"
)
// Packet represents a codec data packet extracted from an MKV file.
// Offset/Size describe the packet payload only — the SimpleBlock/Block
// header (track number, timestamp, flags) has already been skipped.
type Packet struct {
	Offset    int64  // Offset in the MKV file where packet data starts
	Size      int64  // Size of packet data
	TrackNum  uint64 // Track number this packet belongs to
	Timestamp int64  // Absolute timestamp (cluster + block relative)
	Keyframe  bool   // Whether this is a keyframe
}
// Track represents an MKV track (video, audio, etc).
// Type holds one of the TrackType* constants; CodecID is the Matroska codec
// identifier string (e.g. from the CodecID element).
type Track struct {
	Number       uint64
	UID          uint64
	Type         int
	CodecID      string
	CodecPrivate []byte // Codec-specific init data (zero-copy slice into mmap'd data)
}
// Parser parses MKV files to extract codec packets.
// The file is memory-mapped, so tracks' CodecPrivate slices and all parsing
// reads reference the mapping directly; it stays valid until Close.
type Parser struct {
	path     string
	mmapFile *mmap.File
	data     []byte // Zero-copy mmap'd data
	size     int64
	tracks   []Track
	packets  []Packet
}
// NewParser creates an MKV parser that memory-maps the given file for
// zero-copy access. Callers must Close the parser to release the mapping.
func NewParser(path string) (*Parser, error) {
	st, err := os.Stat(path)
	if err != nil {
		return nil, fmt.Errorf("stat file: %w", err)
	}
	mf, err := mmap.Open(path)
	if err != nil {
		return nil, fmt.Errorf("mmap file: %w", err)
	}
	p := &Parser{
		path:     path,
		mmapFile: mf,
		data:     mf.Data(),
		size:     st.Size(),
	}
	return p, nil
}
// Close unmaps the file and releases resources used by the parser.
func (p *Parser) Close() error {
	if p.mmapFile == nil {
		return nil
	}
	return p.mmapFile.Close()
}
// Size returns the file size in bytes (captured at NewParser time).
func (p *Parser) Size() int64 {
	return p.size
}

// ProgressFunc is called to report parsing progress: bytes processed so far
// and the total file size.
type ProgressFunc func(processed, total int64)
// Parse parses the MKV file, populating tracks and extracting all codec
// packets. If progress is non-nil it is invoked roughly once per MiB of
// input and once more at completion.
func (p *Parser) Parse(progress ProgressFunc) error {
	offset := int64(0)
	// Parse EBML header
	elem, err := p.readElementAt(offset)
	if err != nil {
		return fmt.Errorf("read EBML header: %w", err)
	}
	if elem.ID != IDEBMLHeader {
		return fmt.Errorf("expected EBML header, got 0x%X", elem.ID)
	}
	offset = elem.DataOffset + elem.Size
	// Parse Segment
	elem, err = p.readElementAt(offset)
	if err != nil {
		return fmt.Errorf("read Segment: %w", err)
	}
	if elem.ID != IDSegment {
		return fmt.Errorf("expected Segment, got 0x%X", elem.ID)
	}
	segmentDataStart := elem.DataOffset
	segmentEnd := elem.DataOffset + elem.Size
	if elem.Size < 0 {
		// Unknown segment size: scan to end of file.
		segmentEnd = p.size
	}
	// Parse segment contents
	offset = segmentDataStart
	var clusterTimestamp int64
	// FIX: report progress whenever we cross the next MiB threshold. The
	// previous check (offset%(1024*1024) == 0) fired only when an element
	// happened to start exactly on a MiB boundary — essentially never for
	// arbitrary element offsets — so the callback was effectively silent
	// until completion.
	const progressInterval = int64(1024 * 1024)
	nextProgress := progressInterval
	for offset < segmentEnd {
		if progress != nil && offset >= nextProgress {
			progress(offset, p.size)
			nextProgress = offset + progressInterval
		}
		elem, err = p.readElementAt(offset)
		if err != nil {
			if err == io.EOF {
				break
			}
			return fmt.Errorf("read element at %d: %w", offset, err)
		}
		switch elem.ID {
		case IDTracks:
			if err := p.parseTracks(elem); err != nil {
				return fmt.Errorf("parse tracks: %w", err)
			}
		case IDCluster:
			if err := p.parseCluster(elem, &clusterTimestamp); err != nil {
				return fmt.Errorf("parse cluster at %d: %w", offset, err)
			}
		}
		// Move to next element
		if elem.Size < 0 {
			// Unknown size - need to scan for next element
			// For now, we'll just move past the header
			offset = elem.DataOffset
		} else {
			offset = elem.DataOffset + elem.Size
		}
	}
	// Final report so callers always see 100%.
	if progress != nil {
		progress(p.size, p.size)
	}
	return nil
}
// readElementAt decodes the EBML element header starting at offset, or
// returns io.EOF when offset is at or beyond the end of the file.
func (p *Parser) readElementAt(offset int64) (Element, error) {
	if offset >= p.size {
		return Element{}, io.EOF
	}
	// Zero-copy: wrap the mmap'd slice in a reader; no data is copied.
	return ReadElementHeader(bytes.NewReader(p.data[offset:]), offset)
}
// parseTracks walks the children of a Tracks element and records every
// TrackEntry found; other child elements are skipped.
func (p *Parser) parseTracks(tracksElem Element) error {
	end := tracksElem.DataOffset + tracksElem.Size
	for pos := tracksElem.DataOffset; pos < end; {
		child, err := p.readElementAt(pos)
		if err != nil {
			return err
		}
		if child.ID == IDTrackEntry {
			tr, err := p.parseTrackEntry(child)
			if err != nil {
				return fmt.Errorf("parse track entry: %w", err)
			}
			p.tracks = append(p.tracks, tr)
		}
		pos = child.DataOffset + child.Size
	}
	return nil
}
// parseTrackEntry decodes a single TrackEntry element into a Track,
// extracting the number, UID, type, codec ID, and codec private data.
func (p *Parser) parseTrackEntry(trackElem Element) (Track, error) {
	var tr Track
	end := trackElem.DataOffset + trackElem.Size
	for pos := trackElem.DataOffset; pos < end; {
		child, err := p.readElementAt(pos)
		if err != nil {
			return tr, err
		}
		// Zero-copy: the reader wraps the mmap'd slice directly.
		payload := bytes.NewReader(p.data[child.DataOffset : child.DataOffset+child.Size])
		switch child.ID {
		case IDTrackNum:
			tr.Number, _ = ReadUint(payload, child.Size)
		case IDTrackUID:
			tr.UID, _ = ReadUint(payload, child.Size)
		case IDTrackType:
			v, _ := ReadUint(payload, child.Size)
			tr.Type = int(v)
		case IDCodecID:
			tr.CodecID, _ = ReadString(payload, child.Size)
		case IDCodecPrivate:
			// Zero-copy: slice straight into the mmap'd data.
			tr.CodecPrivate = p.data[child.DataOffset : child.DataOffset+child.Size]
		}
		pos = child.DataOffset + child.Size
	}
	return tr, nil
}
// parseCluster parses a Cluster element and extracts packets.
// clusterTimestamp carries the cluster's base timestamp: it is updated when
// an IDTimestamp child is seen and added to each block's relative timestamp.
func (p *Parser) parseCluster(clusterElem Element, clusterTimestamp *int64) error {
	offset := clusterElem.DataOffset
	end := clusterElem.DataOffset + clusterElem.Size
	if clusterElem.Size < 0 {
		// Unknown size - parse until we hit another top-level element
		end = p.size
	}
	for offset < end {
		elem, err := p.readElementAt(offset)
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		// Check if we've hit a top-level element (end of cluster with unknown size)
		if isTopLevelElement(elem.ID) && clusterElem.Size < 0 {
			break
		}
		switch elem.ID {
		case IDTimestamp:
			// Zero-copy: create a bytes.Reader over the slice
			r := bytes.NewReader(p.data[elem.DataOffset : elem.DataOffset+elem.Size])
			ts, _ := ReadUint(r, elem.Size)
			*clusterTimestamp = int64(ts)
		case IDSimpleBlock:
			if err := p.parseSimpleBlock(elem, *clusterTimestamp); err != nil {
				return fmt.Errorf("parse SimpleBlock: %w", err)
			}
		case IDBlockGroup:
			if err := p.parseBlockGroup(elem, *clusterTimestamp); err != nil {
				return fmt.Errorf("parse BlockGroup: %w", err)
			}
		}
		offset = elem.DataOffset + elem.Size
	}
	return nil
}
// parseSimpleBlock parses a SimpleBlock element and appends one packet for it.
//
// Fix: the original branched on header.LacingType() but both branches
// appended a byte-identical Packet, so the duplicate branch is collapsed.
// Laced data is still treated as a single packet (individual frames are
// not split out); this matches the original behavior exactly, assuming
// LacingType() is a pure accessor on the parsed header — TODO confirm.
func (p *Parser) parseSimpleBlock(elem Element, clusterTimestamp int64) error {
	// Zero-copy: read header bytes directly from mmap'd data.
	readSize := elem.Size
	if readSize > 16 {
		readSize = 16 // More than enough for header
	}
	endOffset := elem.DataOffset + readSize
	if endOffset > p.size {
		endOffset = p.size
	}
	headerBuf := p.data[elem.DataOffset:endOffset]
	if len(headerBuf) < 4 {
		return fmt.Errorf("read SimpleBlock header: data too short")
	}
	header, err := ParseSimpleBlockHeader(headerBuf)
	if err != nil {
		return err
	}
	// The packet data follows the header. Whether or not lacing is present,
	// the entire payload is recorded as one packet.
	p.packets = append(p.packets, Packet{
		Offset:    elem.DataOffset + int64(header.HeaderSize),
		Size:      elem.Size - int64(header.HeaderSize),
		TrackNum:  header.TrackNumber,
		Timestamp: clusterTimestamp + int64(header.Timestamp),
		Keyframe:  header.IsKeyframe(),
	})
	return nil
}
// parseBlockGroup parses a BlockGroup element and appends a packet for each
// Block child it contains. Non-Block children are skipped.
func (p *Parser) parseBlockGroup(groupElem Element, clusterTimestamp int64) error {
	cur := groupElem.DataOffset
	stop := groupElem.DataOffset + groupElem.Size
	for cur < stop {
		child, err := p.readElementAt(cur)
		if err != nil {
			return err
		}
		if child.ID != IDBlock {
			cur = child.DataOffset + child.Size
			continue
		}
		// Block shares the SimpleBlock header layout.
		// Zero-copy: read header bytes straight from the mmap'd data.
		hdrLen := child.Size
		if hdrLen > 16 {
			hdrLen = 16
		}
		hdrEnd := child.DataOffset + hdrLen
		if hdrEnd > p.size {
			hdrEnd = p.size
		}
		hdr := p.data[child.DataOffset:hdrEnd]
		if len(hdr) < 4 {
			return fmt.Errorf("read Block header: data too short")
		}
		header, err := ParseSimpleBlockHeader(hdr)
		if err != nil {
			return err
		}
		p.packets = append(p.packets, Packet{
			Offset:    child.DataOffset + int64(header.HeaderSize),
			Size:      child.Size - int64(header.HeaderSize),
			TrackNum:  header.TrackNumber,
			Timestamp: clusterTimestamp + int64(header.Timestamp),
			// Block doesn't have keyframe flag, would need ReferenceBlock.
			Keyframe: false,
		})
		cur = child.DataOffset + child.Size
	}
	return nil
}
// isTopLevelElement reports whether id is a direct child of the Segment
// (a top-level element); used to detect the end of unknown-size clusters.
func isTopLevelElement(id uint64) bool {
	switch id {
	case IDSeekHead, IDInfo, IDTracks, IDChapters, IDCluster, IDCues, IDTags:
		return true
	default:
		return false
	}
}
// Packets returns all parsed packets.
// The returned slice is the parser's internal storage, not a copy.
func (p *Parser) Packets() []Packet {
	return p.packets
}
// ParseTracksOnly parses only the track headers from the MKV file.
// This is much faster than Parse() since it stops as soon as the Tracks
// element is found, without scanning through clusters/packets.
func (p *Parser) ParseTracksOnly() error {
	// The file must open with an EBML header.
	hdr, err := p.readElementAt(0)
	if err != nil {
		return fmt.Errorf("read EBML header: %w", err)
	}
	if hdr.ID != IDEBMLHeader {
		return fmt.Errorf("expected EBML header, got 0x%X", hdr.ID)
	}
	// Immediately after it comes the Segment.
	seg, err := p.readElementAt(hdr.DataOffset + hdr.Size)
	if err != nil {
		return fmt.Errorf("read Segment: %w", err)
	}
	if seg.ID != IDSegment {
		return fmt.Errorf("expected Segment, got 0x%X", seg.ID)
	}
	segEnd := seg.DataOffset + seg.Size
	if seg.Size < 0 {
		// Unknown-size segment: scan through to the end of the file.
		segEnd = p.size
	}
	// Walk the segment's children until Tracks shows up.
	for cur := seg.DataOffset; cur < segEnd; {
		child, err := p.readElementAt(cur)
		if err != nil {
			return fmt.Errorf("read element at %d: %w", cur, err)
		}
		if child.ID == IDTracks {
			if err := p.parseTracks(child); err != nil {
				return fmt.Errorf("parse tracks: %w", err)
			}
			return nil
		}
		if child.Size < 0 {
			// We cannot skip past an element whose size is unknown.
			return fmt.Errorf("unsupported unknown-size element 0x%X before Tracks", child.ID)
		}
		cur = child.DataOffset + child.Size
	}
	return fmt.Errorf("no Tracks element found")
}
// Tracks returns all parsed tracks.
// The returned slice is the parser's internal storage, not a copy.
func (p *Parser) Tracks() []Track {
	return p.tracks
}
// PacketCount returns the number of packets parsed, across all tracks.
func (p *Parser) PacketCount() int {
	return len(p.packets)
}
// VideoPacketCount returns the number of video packets.
func (p *Parser) VideoPacketCount() int {
	// First collect the set of track numbers that carry video.
	isVideo := make(map[uint64]bool)
	for _, t := range p.tracks {
		if t.Type == TrackTypeVideo {
			isVideo[t.Number] = true
		}
	}
	// Then tally the packets belonging to one of those tracks.
	n := 0
	for _, pkt := range p.packets {
		if isVideo[pkt.TrackNum] {
			n++
		}
	}
	return n
}
// AudioPacketCount returns the number of audio packets.
func (p *Parser) AudioPacketCount() int {
	// First collect the set of track numbers that carry audio.
	isAudio := make(map[uint64]bool)
	for _, t := range p.tracks {
		if t.Type == TrackTypeAudio {
			isAudio[t.Number] = true
		}
	}
	// Then tally the packets belonging to one of those tracks.
	n := 0
	for _, pkt := range p.packets {
		if isAudio[pkt.TrackNum] {
			n++
		}
	}
	return n
}
// ReadPacketData reads the data for a packet.
// Returns a slice into the mmap'd data (zero-copy).
// The returned slice is valid until Close() is called.
//
// Fix: the original only rejected pkt.Offset >= p.size; a negative Offset
// or Size (e.g. from a corrupt block header) panicked on the slice
// expression. Both are now rejected with the same error.
func (p *Parser) ReadPacketData(pkt Packet) ([]byte, error) {
	if pkt.Offset < 0 || pkt.Size < 0 || pkt.Offset >= p.size {
		return nil, fmt.Errorf("read packet data: offset out of range")
	}
	end := pkt.Offset + pkt.Size
	if end > p.size {
		// Clamp packets truncated at EOF to the mapped region.
		end = p.size
	}
	// Zero-copy: return slice directly into mmap'd data
	return p.data[pkt.Offset:end], nil
}
// Data returns the raw mmap'd file data for zero-copy access.
// The returned slice is valid until Close() is called.
// NOTE(review): the backing mapping appears to be read-only (mmap.Open uses
// PROT_READ); do not write through this slice — confirm at the call sites.
func (p *Parser) Data() []byte {
	return p.data
}
// Package mmap provides zero-copy memory-mapped file access.
package mmap
import (
"fmt"
"io"
"os"
"golang.org/x/sys/unix"
)
// SourceFile provides read access to a source file, either via mmap or pread.
type SourceFile interface {
	io.ReaderAt
	// Size returns the file size in bytes.
	Size() int64
	// Close releases the underlying resources (mapping or fd).
	Close() error
}
// MmapData provides zero-copy access to a memory-mapped file's data.
// Types implementing this interface allow callers to use direct slice access
// instead of copying through ReadAt.
type MmapData interface {
	// Data returns the raw mapped bytes; valid until the mapping is closed.
	Data() []byte
}
// File provides zero-copy access to a memory-mapped file.
// Unlike golang.org/x/exp/mmap, this exposes the raw []byte slice
// allowing direct access without copying data.
type File struct {
	data []byte // the mmap'd region; nil for empty files and after Close
	size int64  // length of the mapping in bytes
}
// Open opens a file and memory-maps it for reading.
// The returned File provides zero-copy access to the file contents.
func Open(path string) (*File, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open file: %w", err)
	}
	// The fd is only needed to establish the mapping; the mapping itself
	// remains valid after the descriptor is closed.
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat file: %w", err)
	}

	size := info.Size()
	if size == 0 {
		// A zero-length mapping is not possible; represent an empty file
		// with a nil data slice.
		return &File{data: nil, size: 0}, nil
	}

	data, err := unix.Mmap(int(f.Fd()), 0, int(size), unix.PROT_READ, unix.MAP_SHARED)
	if err != nil {
		return nil, fmt.Errorf("mmap: %w", err)
	}
	return &File{data: data, size: size}, nil
}
// Data returns the raw byte slice for direct zero-copy access.
// The slice is valid until Close() is called; it is nil for an empty
// file and after Close.
func (m *File) Data() []byte {
	return m.data
}
// Size returns the size of the mapped file in bytes.
// Returns 0 after Close.
func (m *File) Size() int64 {
	return m.size
}
// Len returns the size of the mapped file as int (for compatibility).
// NOTE(review): this truncates on 32-bit platforms for files larger
// than the int range.
func (m *File) Len() int {
	return int(m.size)
}
// Slice returns a sub-slice of the mapped data without copying.
// Returns nil if the range is out of bounds.
//
// Fix: the original accepted a negative size, which made end < offset and
// panicked on the slice expression; a negative size now returns nil like
// any other out-of-bounds request, as the doc comment promises.
func (m *File) Slice(offset int64, size int) []byte {
	if offset < 0 || offset >= m.size || size < 0 {
		return nil
	}
	end := offset + int64(size)
	if end > m.size {
		// Clamp requests that run past the end of the mapping.
		end = m.size
	}
	return m.data[offset:end]
}
// Advise provides hints to the kernel about expected access patterns.
// Use MADV_DONTNEED to release pages (they'll be re-faulted when accessed).
// Use MADV_SEQUENTIAL to hint sequential access pattern.
// It is a no-op (returns nil) when nothing is mapped.
func (m *File) Advise(advice int) error {
	if len(m.data) == 0 {
		return nil
	}
	return unix.Madvise(m.data, advice)
}
// ReadAt implements io.ReaderAt by copying from the mmap'd data.
func (m *File) ReadAt(p []byte, off int64) (int, error) {
	// An empty read succeeds regardless of the offset.
	if len(p) == 0 {
		return 0, nil
	}
	switch {
	case off < 0:
		return 0, os.ErrInvalid
	case off >= m.size:
		return 0, io.EOF
	}
	n := copy(p, m.data[off:])
	if n < len(p) {
		// Short read at the end of the mapping.
		return n, io.EOF
	}
	return n, nil
}
// Close unmaps the file from memory.
// It is safe to call multiple times; subsequent calls are no-ops.
// Any slices previously returned by Data or Slice become invalid.
func (m *File) Close() error {
	if m.data == nil {
		return nil
	}
	if err := unix.Munmap(m.data); err != nil {
		return err
	}
	m.data = nil
	m.size = 0
	return nil
}
package mmap
import (
"errors"
"fmt"
"io"
"os"
"sync"
"time"
"golang.org/x/sys/unix"
)
// ReadTimeoutError is returned when a pread operation exceeds the configured timeout.
type ReadTimeoutError struct {
	Path    string        // file being read
	Timeout time.Duration // the per-read deadline that was exceeded
}

// Error implements the error interface.
func (e *ReadTimeoutError) Error() string {
	return fmt.Sprintf("pread timeout after %s: %s", e.Timeout, e.Path)
}
// ReadBackpressureError is returned when all inflight read slots are occupied,
// indicating the network FS is likely stalled. This is distinct from
// ReadTimeoutError, which indicates a single read exceeded its deadline.
type ReadBackpressureError struct {
	Path string // file whose read was rejected
}

// Error implements the error interface.
func (e *ReadBackpressureError) Error() string {
	return fmt.Sprintf("pread backpressure: all %d inflight slots occupied: %s", maxInflight, e.Path)
}
// maxInflight is the maximum number of concurrent in-flight read goroutines
// per PreadFile. This bounds memory/goroutine accumulation when an NFS mount
// is stalled and reads are timing out repeatedly. When every slot is taken,
// ReadAt fails fast with ReadBackpressureError rather than spawning more.
const maxInflight = 16
// PreadFile provides pread(2)-based read access to a source file, with retry
// and stale handle recovery. This is used for source files on network
// filesystems (NFS, CIFS/SMB) where mmap is unsafe due to SIGBUS on
// page fault failures.
type PreadFile struct {
	mu         sync.Mutex    // protects file and staleFiles
	file       *os.File      // current fd; nil after Close
	path       string        // original path, used for reopen and error messages
	size       int64         // file size captured at open time
	timeout    time.Duration // 0 = no timeout
	inflight   chan struct{} // semaphore bounding concurrent timeout goroutines
	staleFiles []*os.File    // old fds kept open until Close to avoid EBADF on in-flight reads
}
// OpenPread opens a file for pread-based access.
// timeout bounds each ReadAt call; pass 0 to disable the deadline.
func OpenPread(path string, timeout time.Duration) (*PreadFile, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open file: %w", err)
	}
	info, err := f.Stat()
	if err != nil {
		f.Close()
		return nil, fmt.Errorf("stat file: %w", err)
	}
	pf := &PreadFile{
		file:     f,
		path:     path,
		size:     info.Size(),
		timeout:  timeout,
		inflight: make(chan struct{}, maxInflight),
	}
	return pf, nil
}
// Size returns the size of the file, as captured at open time.
func (p *PreadFile) Size() int64 {
	return p.size
}
// ReadAt reads len(buf) bytes from the file starting at byte offset off.
// If timeout is configured and the read takes too long, it returns a
// ReadTimeoutError. The underlying goroutine may continue until the kernel
// completes the I/O, but the caller is unblocked. The goroutine reads into
// a private buffer to prevent it from writing to buf after the caller has
// moved on. A per-file semaphore bounds the number of in-flight goroutines
// to prevent unbounded accumulation under a stalled NFS mount.
func (p *PreadFile) ReadAt(buf []byte, off int64) (int, error) {
	if len(buf) == 0 {
		return 0, nil
	}
	if p.timeout <= 0 {
		// No deadline configured: read synchronously on the caller's goroutine.
		return p.readAtWithRetry(buf, off)
	}
	// Acquire an inflight slot (non-blocking). If all slots are occupied
	// the NFS mount is likely stalled — fail fast instead of spawning
	// more goroutines.
	select {
	case p.inflight <- struct{}{}:
	default:
		return 0, &ReadBackpressureError{Path: p.path}
	}
	type result struct {
		n   int
		err error
	}
	// Read into a private buffer so an abandoned goroutine (after timeout)
	// cannot write into buf while it is being reused by the caller.
	tmp := make([]byte, len(buf))
	// Buffered so the goroutine can deliver its result and exit even when
	// the caller has already timed out and stopped listening.
	ch := make(chan result, 1)
	go func() {
		defer func() { <-p.inflight }() // release the inflight slot
		n, err := p.readAtWithRetry(tmp, off)
		ch <- result{n, err}
	}()
	timer := time.NewTimer(p.timeout)
	defer timer.Stop()
	select {
	case r := <-ch:
		// Completed in time: copy only the bytes actually read.
		copy(buf[:r.n], tmp[:r.n])
		return r.n, r.err
	case <-timer.C:
		return 0, &ReadTimeoutError{Path: p.path, Timeout: p.timeout}
	}
}
// readAtWithRetry performs a pread with one retry on retryable errors,
// reopening the file descriptor if needed. The mutex is only held briefly
// to copy the fd pointer — not during the pread syscall — so Close() and
// reopen() are never blocked by a stalled network read. Old fds from
// reopen are kept in staleFiles (not closed) to avoid EBADF on
// concurrent in-flight reads; they are cleaned up on Close().
func (p *PreadFile) readAtWithRetry(buf []byte, off int64) (int, error) {
	p.mu.Lock()
	f := p.file
	p.mu.Unlock()
	if f == nil {
		return 0, os.ErrClosed
	}
	n, err := f.ReadAt(buf, off)
	// io.EOF is a normal short-read signal, never a reason to retry.
	if err != nil && err != io.EOF && isRetryableError(err) {
		if reopenErr := p.reopen(); reopenErr != nil {
			return n, fmt.Errorf("pread retry failed (reopen: %w, original: %w)", reopenErr, err)
		}
		// Re-fetch the fd: reopen just swapped in a fresh one.
		p.mu.Lock()
		f = p.file
		p.mu.Unlock()
		if f == nil {
			return 0, os.ErrClosed
		}
		n, err = f.ReadAt(buf, off)
	}
	return n, err
}
// reopen opens a new fd and swaps it in. The old fd is not closed
// immediately because in-flight goroutines may still hold a reference
// to it (copied under the mutex before the pread syscall). Old fds are
// collected in staleFiles and cleaned up on Close().
//
// Fd accumulation is bounded in practice: reopens only occur on transient
// network errors (ESTALE, ETIMEDOUT, etc.), which are rare. Even under
// a flaky mount, each reopen adds just one fd, well within default ulimits.
func (p *PreadFile) reopen() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.file == nil {
		// Already closed; do not resurrect the file.
		return os.ErrClosed
	}
	newFile, err := os.Open(p.path)
	if err != nil {
		return fmt.Errorf("reopen: %w", err)
	}
	info, err := newFile.Stat()
	if err != nil {
		newFile.Close()
		return fmt.Errorf("reopen stat: %w", err)
	}
	// A size change means the file was replaced; previously computed
	// offsets would be unsafe, so refuse the new fd.
	if info.Size() != p.size {
		newFile.Close()
		return fmt.Errorf("reopen: size changed (%d → %d)", p.size, info.Size())
	}
	p.staleFiles = append(p.staleFiles, p.file)
	p.file = newFile
	return nil
}
// Close closes the current file and any stale fds from previous reopens.
// Every fd is closed regardless of errors; the first error seen is returned.
func (p *PreadFile) Close() error {
	p.mu.Lock()
	defer p.mu.Unlock()

	var firstErr error
	record := func(err error) {
		if err != nil && firstErr == nil {
			firstErr = err
		}
	}
	if p.file != nil {
		record(p.file.Close())
		p.file = nil
	}
	for _, stale := range p.staleFiles {
		record(stale.Close())
	}
	p.staleFiles = nil
	return firstErr
}
// isRetryableError checks if an error is a transient network FS error
// that may succeed on retry (possibly after reopening the fd).
func isRetryableError(err error) bool {
	var errno unix.Errno
	if !errors.As(err, &errno) {
		return false
	}
	return errno == unix.ESTALE || errno == unix.ETIMEDOUT ||
		errno == unix.ECONNRESET || errno == unix.EIO
}
// Package security provides file ownership and path confinement checks
// for FUSE mounts running as root.
package security
import (
"fmt"
"os"
"path/filepath"
"strings"
"syscall"
)
// fileStatFunc is a package-level var for os.Stat, allowing test injection.
// All ownership checks in this package stat through it.
var fileStatFunc = os.Stat

// Geteuid returns the effective user ID. Exported for testing; the checks
// in this package are enforced only when it reports 0 (root).
var Geteuid = os.Geteuid
// CheckFileOwnership validates that a file is root-owned and not
// group-writable or world-writable. Returns nil if safe, or an error
// describing the violation. Only checks when running as root (euid == 0).
// The path is resolved via EvalSymlinks before checking.
func CheckFileOwnership(path string) error {
	if Geteuid() == 0 {
		resolved, err := filepath.EvalSymlinks(path)
		if err != nil {
			return fmt.Errorf("resolve %s: %w", path, err)
		}
		return checkOwnership(resolved)
	}
	// Non-root: ownership checks are not enforced.
	return nil
}
// CheckFileOwnershipResolved is like CheckFileOwnership but skips symlink
// resolution, assuming the caller already canonicalized the path.
// Only checks when running as root (euid == 0).
func CheckFileOwnershipResolved(path string) error {
	if Geteuid() == 0 {
		return checkOwnership(path)
	}
	// Non-root: ownership checks are not enforced.
	return nil
}
// checkOwnership performs the actual ownership and permission checks on
// an already-resolved path: the file must be uid-0 owned and writable by
// neither group nor world.
func checkOwnership(path string) error {
	info, err := fileStatFunc(path)
	if err != nil {
		return fmt.Errorf("stat %s: %w", path, err)
	}
	st, ok := info.Sys().(*syscall.Stat_t)
	if !ok {
		// Non-unix FileInfo (or a test double without Stat_t).
		return fmt.Errorf("cannot get ownership info for %s", path)
	}
	mode := info.Mode()
	switch {
	case st.Uid != 0:
		return fmt.Errorf("security: %s is owned by uid %d, not root", path, st.Uid)
	case mode&0020 != 0:
		return fmt.Errorf("security: %s is group-writable (%04o)", path, mode.Perm())
	case mode&0002 != 0:
		return fmt.Errorf("security: %s is world-writable (%04o)", path, mode.Perm())
	}
	return nil
}
// CheckPathConfinement resolves sourceDir + relPath, canonicalizes via
// EvalSymlinks, and verifies the result stays within sourceDir. Returns
// the canonical path or an error. Only checks when running as root.
//
// When not running as root, returns the simple joined path without
// canonicalization (preserving existing behavior).
//
// NOTE(review): EvalSymlinks fails for nonexistent paths, so a missing
// source file surfaces here as a resolve error rather than later as ENOENT.
func CheckPathConfinement(sourceDir, relPath string) (string, error) {
	// Reject absolute paths regardless of euid — filepath.Join would
	// silently drop sourceDir for absolute relPath, allowing escape.
	if filepath.IsAbs(relPath) {
		return "", fmt.Errorf("security: absolute source path %q not allowed", relPath)
	}
	if Geteuid() != 0 {
		// Non-root: return cleaned join without canonicalization.
		// Absolute relPath is already rejected above, so Join always
		// prepends sourceDir. Note that Join cleans ".." components,
		// but confinement is not enforced in non-root mode.
		return filepath.Join(sourceDir, relPath), nil
	}
	// Canonicalize sourceDir
	canonicalDir, err := filepath.EvalSymlinks(sourceDir)
	if err != nil {
		return "", fmt.Errorf("security: resolve source dir %s: %w", sourceDir, err)
	}
	// Canonicalize the full path
	joined := filepath.Join(sourceDir, relPath)
	canonical, err := filepath.EvalSymlinks(joined)
	if err != nil {
		return "", fmt.Errorf("security: resolve source path %s: %w", joined, err)
	}
	// Use trailing separator to prevent prefix attacks
	// (e.g., /data/source-evil matching /data/source)
	if !strings.HasPrefix(canonical+"/", canonicalDir+"/") {
		return "", fmt.Errorf("security: source path %s escapes source dir %s (resolved to %s)", relPath, sourceDir, canonical)
	}
	return canonical, nil
}
// CheckDirectory validates that a path is a directory, is root-owned,
// and is not group-writable or world-writable. Returns nil if safe.
// Only checks when running as root (euid == 0).
// The path is resolved via EvalSymlinks before checking.
func CheckDirectory(dir string) error {
	if Geteuid() == 0 {
		resolved, err := filepath.EvalSymlinks(dir)
		if err != nil {
			return fmt.Errorf("resolve %s: %w", dir, err)
		}
		return checkDirectory(resolved)
	}
	// Non-root: directory checks are not enforced.
	return nil
}
// CheckDirectoryResolved is like CheckDirectory but skips symlink
// resolution, assuming the caller already canonicalized the path.
// Only checks when running as root (euid == 0).
func CheckDirectoryResolved(dir string) error {
	if Geteuid() == 0 {
		return checkDirectory(dir)
	}
	// Non-root: directory checks are not enforced.
	return nil
}
// checkDirectory performs ownership and directory checks on an
// already-resolved path.
func checkDirectory(dir string) error {
	// Ownership and permission bits first, matching file checks.
	if err := checkOwnership(dir); err != nil {
		return err
	}
	info, err := fileStatFunc(dir)
	if err != nil {
		return fmt.Errorf("stat %s: %w", dir, err)
	}
	if info.IsDir() {
		return nil
	}
	return fmt.Errorf("security: %s is not a directory", dir)
}
package source
// FindAudioSyncPoints finds all audio sync pattern positions in the data.
// Detects AC3, DTS, TrueHD, and MPEG Audio sync patterns.
// Returns offsets where sync patterns begin.
func FindAudioSyncPoints(data []byte) []int {
	n := len(data)
	if n < 2 {
		return nil
	}
	var hits []int
	for i := 0; i <= n-2; i++ {
		b0, b1 := data[i], data[i+1]
		// AC3/E-AC3 sync word: 0B 77.
		if b0 == 0x0B && b1 == 0x77 {
			hits = append(hits, i)
			continue
		}
		// DTS/DTS-HD sync word: 7F FE 80 01.
		if i+4 <= n && b0 == 0x7F && b1 == 0xFE && data[i+2] == 0x80 && data[i+3] == 0x01 {
			hits = append(hits, i)
			continue
		}
		// TrueHD sync word: F8 72 6F BA.
		if i+4 <= n && b0 == 0xF8 && b1 == 0x72 && data[i+2] == 0x6F && data[i+3] == 0xBA {
			hits = append(hits, i)
			continue
		}
		// MPEG Audio / AAC ADTS: 0xFF followed by 0xFx (an 11-bit sync
		// word). The third byte's upper nibble must not be 0xF — bitrate
		// index 1111 is reserved/invalid, and requiring this rejects the
		// massive false positives a run of 0xFF padding bytes (as found in
		// MPEG-TS adaptation fields) would otherwise produce.
		if i+3 <= n && b0 == 0xFF && b1&0xF0 == 0xF0 && data[i+2]&0xF0 != 0xF0 {
			hits = append(hits, i)
		}
	}
	return hits
}
// FindAudioSyncPointsInRange finds audio sync points within a specific range of data.
// This is useful for processing large files in chunks: each reported offset is
// the match position within data plus startOffset.
func FindAudioSyncPointsInRange(data []byte, startOffset int) []int {
	n := len(data)
	if n < 2 {
		return nil
	}
	var hits []int
	emit := func(i int) { hits = append(hits, startOffset+i) }
	for i := 0; i <= n-2; i++ {
		b0, b1 := data[i], data[i+1]
		switch {
		// AC3/E-AC3 sync word: 0B 77.
		case b0 == 0x0B && b1 == 0x77:
			emit(i)
		// DTS/DTS-HD sync word: 7F FE 80 01.
		case i+4 <= n && b0 == 0x7F && b1 == 0xFE && data[i+2] == 0x80 && data[i+3] == 0x01:
			emit(i)
		// TrueHD sync word: F8 72 6F BA.
		case i+4 <= n && b0 == 0xF8 && b1 == 0x72 && data[i+2] == 0x6F && data[i+3] == 0xBA:
			emit(i)
		// MPEG Audio / AAC ADTS: FF Fx with a valid (non-1111) bitrate index.
		case i+3 <= n && b0 == 0xFF && b1&0xF0 == 0xF0 && data[i+2]&0xF0 != 0xF0:
			emit(i)
		}
	}
	return hits
}
// AC3FrameSize returns the frame size in bytes for an AC3 sync frame given
// the fscod (sample rate code, 2 bits) and frmsizecod (frame size code, 6 bits)
// from byte 4 of the sync frame. Returns 0 if the codes are invalid.
// Based on ATSC A/52 Table 5.18.
func AC3FrameSize(fscod, frmsizecod byte) int {
	// Valid ranges: fscod 0-2 (48/44.1/32 kHz), frmsizecod 0-37.
	if fscod > 2 || frmsizecod > 37 {
		return 0
	}
	// Frame sizes in 16-bit words, indexed by [fscod][frmsizecod]
	words := [3][38]int{
		// 48 kHz
		{64, 64, 80, 80, 96, 96, 112, 112, 128, 128, 160, 160, 192, 192, 224, 224, 256, 256, 320, 320, 384, 384, 448, 448, 512, 512, 640, 640, 768, 768, 896, 896, 1024, 1024, 1152, 1152, 1280, 1280},
		// 44.1 kHz
		{69, 70, 87, 88, 104, 105, 121, 122, 139, 140, 174, 175, 208, 209, 243, 244, 278, 279, 348, 349, 417, 418, 487, 488, 557, 558, 696, 697, 835, 836, 975, 976, 1114, 1115, 1253, 1254, 1393, 1394},
		// 32 kHz
		{96, 96, 120, 120, 144, 144, 168, 168, 192, 192, 240, 240, 288, 288, 336, 336, 384, 384, 480, 480, 576, 576, 672, 672, 768, 768, 960, 960, 1152, 1152, 1344, 1344, 1536, 1536, 1728, 1728, 1920, 1920},
	}
	// Table entries are 16-bit words, so the byte size is double.
	return words[fscod][frmsizecod] * 2
}
// DTSCoreFrameSize parses a DTS core frame header and returns the frame size
// in bytes. The data must start at the DTS sync word (7F FE 80 01) and be at
// least 7 bytes long. Returns 0 if the header is invalid.
//
// DTS core frame header layout (after 4-byte sync word):
//
//	Bit 0:      frame_type (1 bit)
//	Bits 1-5:   deficit_samples (5 bits)
//	Bit 6:      crc_present (1 bit)
//	Bits 7-13:  npcmblocks (7 bits)
//	Bits 14-27: frame_size - 1 (14 bits)
//
// Reference: ETSI TS 102 114 (DTS Coherent Acoustics), confirmed against
// ffmpeg's ff_dca_parse_core_frame_header in libavcodec/dca.c.
func DTSCoreFrameSize(data []byte) int {
	// Need the 4-byte sync word plus 3 header bytes.
	if len(data) < 7 {
		return 0
	}
	if !(data[0] == 0x7F && data[1] == 0xFE && data[2] == 0x80 && data[3] == 0x01) {
		return 0
	}
	// frame_size-1 is a 14-bit field: the low 7 bits of byte 5 are the high
	// half, the top 7 bits of byte 6 are the low half.
	hi := int(data[5] & 0x7F)
	lo := int(data[6] >> 1)
	size := (hi<<7 | lo) + 1
	if size < 96 {
		return 0 // Too small to be a valid DTS frame
	}
	return size
}
// FindAllSyncPoints finds both video start codes and audio sync patterns.
// Returns combined offsets sorted by position.
//
// Fix: the original allocated a `combined` slice and appended both input
// lists into it, but never read it — the two-pointer merge below already
// produces the result. The dead slice and its copies are removed.
func FindAllSyncPoints(data []byte) []int {
	videoOffsets := FindVideoStartCodes(data)
	audioOffsets := FindAudioSyncPoints(data)
	// Both lists are already sorted by position, so a standard two-pointer
	// merge yields the combined sorted result in a single pass.
	result := make([]int, 0, len(videoOffsets)+len(audioOffsets))
	vi, ai := 0, 0
	for vi < len(videoOffsets) || ai < len(audioOffsets) {
		switch {
		case vi >= len(videoOffsets):
			result = append(result, audioOffsets[ai])
			ai++
		case ai >= len(audioOffsets):
			result = append(result, videoOffsets[vi])
			vi++
		case videoOffsets[vi] <= audioOffsets[ai]:
			result = append(result, videoOffsets[vi])
			vi++
		default:
			result = append(result, audioOffsets[ai])
			ai++
		}
	}
	return result
}
package source
import (
"encoding/binary"
"fmt"
"os"
"path/filepath"
"strings"
)
// parseBlurayClipInfoCodecs parses a CLPI file's ProgramInfo section to extract
// codec information. CLPI files are small metadata files in BDMV/CLIPINF/ that
// authoritatively declare every elementary stream's codec type.
//
// CLPI header layout:
//
//	0x00-0x03: Type indicator ("HDMV")
//	0x04-0x07: Version string
//	0x08-0x0B: SequenceInfo start offset (4 bytes, big-endian)
//	0x0C-0x0F: ProgramInfo start offset (4 bytes, big-endian)
//
// ProgramInfo layout:
//
//	[0-3] Section length (4 bytes, big-endian)
//	[4]   Reserved
//	[5]   Number of program sequences
//	Per sequence:
//	  [0-3] SPN_program_sequence_start (4 bytes)
//	  [4-5] program_map_PID (2 bytes)
//	  [6]   num_streams_in_ps (1 byte)
//	  [7]   num_groups (1 byte)
//	Per stream:
//	  [0-1] stream_PID (2 bytes)
//	  [2]   stream_coding_info_length (1 byte)
//	  [3]   stream_coding_type (1 byte) — same values as tsStreamTypeToCodecType
func parseBlurayClipInfoCodecs(data []byte) (*SourceCodecs, error) {
	// Need at least the fixed 16-byte CLPI header.
	if len(data) < 16 {
		return nil, fmt.Errorf("CLPI data too short (%d bytes)", len(data))
	}
	magic := string(data[0:4])
	if magic != "HDMV" {
		return nil, fmt.Errorf("not a CLPI file (magic: %q)", magic)
	}
	progInfoOffset := binary.BigEndian.Uint32(data[12:16])
	// The section needs at least its 4-byte length + reserved + count bytes.
	if progInfoOffset == 0 || int(progInfoOffset)+6 > len(data) {
		return nil, fmt.Errorf("invalid ProgramInfo offset: %d", progInfoOffset)
	}
	pi := data[progInfoOffset:]
	if len(pi) < 6 {
		return nil, fmt.Errorf("ProgramInfo section too short")
	}
	piLen := binary.BigEndian.Uint32(pi[0:4])
	if piLen == 0 {
		return nil, fmt.Errorf("empty ProgramInfo section")
	}
	// Cap the section to its declared length + header so a malformed
	// length field cannot let the loops read past the section.
	pi = pi[:min(int(piLen)+4, len(pi))]
	numSeqs := int(pi[5])
	codecs := &SourceCodecs{}
	off := 6
	for range numSeqs {
		// Each sequence header is 8 bytes; stop cleanly on truncation.
		if off+8 > len(pi) {
			break
		}
		// SPN(4) + program_map_PID(2) + num_streams(1) + num_groups(1)
		//
		// num_groups (pi[off+7]) is not processed. The Blu-ray spec is proprietary
		// and the group entry format is undocumented. In practice num_groups is
		// always 0 on real discs, and no open-source parser (libbluray, MKVToolNix)
		// processes group entries either — the field is effectively reserved.
		numStreams := int(pi[off+6])
		off += 8
		for range numStreams {
			if off+3 > len(pi) {
				break
			}
			// stream_PID(2) + ci_len(1)
			ciLen := int(pi[off+2])
			if ciLen > 0 && off+3 < len(pi) {
				// First byte of the coding info is the stream_coding_type,
				// which maps onto a codec family; dedupe per category.
				streamType := pi[off+3]
				ct := tsStreamTypeToCodecType(streamType)
				if ct != CodecUnknown {
					if IsVideoCodec(ct) {
						if !containsCodec(codecs.VideoCodecs, ct) {
							codecs.VideoCodecs = append(codecs.VideoCodecs, ct)
						}
					} else if IsAudioCodec(ct) {
						if !containsCodec(codecs.AudioCodecs, ct) {
							codecs.AudioCodecs = append(codecs.AudioCodecs, ct)
						}
					} else if IsSubtitleCodec(ct) {
						if !containsCodec(codecs.SubtitleCodecs, ct) {
							codecs.SubtitleCodecs = append(codecs.SubtitleCodecs, ct)
						}
					}
				}
			}
			// Advance past PID + length byte + the coding info payload.
			off += 3 + ciLen
		}
	}
	return codecs, nil
}
// findCLPIsInISO navigates an ISO9660 filesystem to find CLPI files
// under BDMV/CLIPINF/. Returns the file extents or an error.
// The traversal is root → BDMV → CLIPINF, one directory level at a time.
func findCLPIsInISO(f *os.File) ([]isoFileExtent, error) {
	// Locate the root directory via the Primary Volume Descriptor.
	rootLBA, rootLen, err := readISOPVDRoot(f)
	if err != nil {
		return nil, err
	}
	rootEntries, err := readISODirectory(f, rootLBA, rootLen)
	if err != nil {
		return nil, fmt.Errorf("read ISO root directory: %w", err)
	}
	bdmv, err := findISOEntry(rootEntries, "BDMV")
	if err != nil {
		return nil, fmt.Errorf("find BDMV directory: %w", err)
	}
	// Entries store byte offsets; convert back to LBA sectors for reading.
	bdmvEntries, err := readISODirectory(f, uint32(bdmv.Offset/isoSectorSize), uint32(bdmv.Size))
	if err != nil {
		return nil, fmt.Errorf("read BDMV directory: %w", err)
	}
	clipinf, err := findISOEntry(bdmvEntries, "CLIPINF")
	if err != nil {
		return nil, fmt.Errorf("find CLIPINF directory: %w", err)
	}
	clipinfEntries, err := readISODirectory(f, uint32(clipinf.Offset/isoSectorSize), uint32(clipinf.Size))
	if err != nil {
		return nil, fmt.Errorf("read CLIPINF directory: %w", err)
	}
	// Collect every file entry with a .CLPI suffix.
	var clpis []isoFileExtent
	for _, e := range clipinfEntries {
		if !e.IsDir && strings.HasSuffix(e.Name, ".CLPI") {
			clpis = append(clpis, e)
		}
	}
	if len(clpis) == 0 {
		return nil, fmt.Errorf("no CLPI files found in BDMV/CLIPINF/")
	}
	return clpis, nil
}
// findCLPIsInUDF navigates a UDF filesystem to find CLPI files under BDMV/CLIPINF/.
// Mirrors findCLPIsInISO, but walks UDF file identifier descriptors instead.
func findCLPIsInUDF(f *os.File) ([]isoFileExtent, error) {
	ctx, err := newUDFContext(f)
	if err != nil {
		return nil, err
	}
	rootFIDs, err := ctx.readDirectoryFromFE(ctx.rootFE)
	if err != nil {
		return nil, fmt.Errorf("read UDF root directory: %w", err)
	}
	bdmvFE, err := ctx.lookupDir(rootFIDs, "BDMV")
	if err != nil {
		return nil, fmt.Errorf("find BDMV: %w", err)
	}
	bdmvFIDs, err := ctx.readDirectoryFromFE(bdmvFE)
	if err != nil {
		return nil, fmt.Errorf("read BDMV directory: %w", err)
	}
	clipinfFE, err := ctx.lookupDir(bdmvFIDs, "CLIPINF")
	if err != nil {
		return nil, fmt.Errorf("find CLIPINF: %w", err)
	}
	clipinfFIDs, err := ctx.readDirectoryFromFE(clipinfFE)
	if err != nil {
		return nil, fmt.Errorf("read CLIPINF directory: %w", err)
	}
	var clpis []isoFileExtent
	for _, fid := range clipinfFIDs {
		// Skip subdirectories and the parent-directory entry.
		if fid.IsDir || fid.IsParent {
			continue
		}
		// Case-insensitive match on the .CLPI extension.
		name := strings.ToUpper(fid.Name)
		if !strings.HasSuffix(name, ".CLPI") {
			continue
		}
		// Best-effort: skip entries whose file entry or extents cannot
		// be resolved rather than failing the whole scan.
		fe, err := ctx.readFileEntryAt(fid.ICBLocation)
		if err != nil {
			continue
		}
		extents, err := ctx.resolveAllExtents(fe)
		if err != nil || len(extents) == 0 {
			continue
		}
		clpi := isoFileExtent{
			Name:   name,
			Offset: extents[0].ISOOffset,
			Size:   int64(fe.InfoLength),
			IsDir:  false,
		}
		// Only record the extent list for fragmented files; a contiguous
		// file is fully described by Offset+Size.
		if !extentsContiguous(extents) {
			clpi.Extents = extents
		}
		clpis = append(clpis, clpi)
	}
	if len(clpis) == 0 {
		return nil, fmt.Errorf("no CLPI files found in UDF BDMV/CLIPINF/")
	}
	return clpis, nil
}
// detectBlurayCodecsFromCLPIs reads CLPI files from within an ISO and returns
// the unioned codec information from all clip info files. Individual CLPI
// failures are tolerated; an error is returned only when none parse.
func detectBlurayCodecsFromCLPIs(f *os.File, clpis []isoFileExtent) (*SourceCodecs, error) {
	// Cap read size to prevent excessive allocation from malformed metadata.
	// Real CLPI files are ~64-78KB.
	const maxCLPISize int64 = 8 * 1024 * 1024

	merged := &SourceCodecs{}
	var lastErr error
	parsed := 0
	for _, clpi := range clpis {
		data, err := readISOFileExtent(f, clpi, maxCLPISize)
		if err != nil {
			lastErr = err
			continue
		}
		codecs, err := parseBlurayClipInfoCodecs(data)
		if err != nil {
			lastErr = err
			continue
		}
		mergeSourceCodecs(merged, codecs)
		parsed++
	}
	if parsed == 0 {
		if lastErr != nil {
			return nil, fmt.Errorf("failed to parse any CLPI file: %w", lastErr)
		}
		return nil, fmt.Errorf("no valid CLPI files found")
	}
	return merged, nil
}
// detectBlurayCodecsFromCLPIDir detects codecs from CLPI files in an extracted
// Blu-ray directory structure (BDMV/CLIPINF/*.clpi). Individual CLPI failures
// are tolerated; an error is returned only when none parse.
func detectBlurayCodecsFromCLPIDir(sourceDir string) (*SourceCodecs, error) {
	clipinfDir := sourceDir
	if !strings.HasSuffix(strings.ToUpper(sourceDir), "CLIPINF") {
		// sourceDir is the disc root; locate BDMV/CLIPINF beneath it,
		// accepting either upper- or lower-case layouts.
		clipinfDir = ""
		for _, cand := range []string{
			filepath.Join(sourceDir, "BDMV", "CLIPINF"),
			filepath.Join(sourceDir, "bdmv", "clipinf"),
		} {
			if info, err := os.Stat(cand); err == nil && info.IsDir() {
				clipinfDir = cand
				break
			}
		}
		if clipinfDir == "" {
			return nil, fmt.Errorf("BDMV/CLIPINF directory not found in %s", sourceDir)
		}
	}
	entries, err := os.ReadDir(clipinfDir)
	if err != nil {
		return nil, fmt.Errorf("read CLIPINF directory: %w", err)
	}
	merged := &SourceCodecs{}
	var lastErr error
	parsed := 0
	for _, entry := range entries {
		// Only regular files with a .CLPI extension (any case).
		if entry.IsDir() || !strings.HasSuffix(strings.ToUpper(entry.Name()), ".CLPI") {
			continue
		}
		data, err := os.ReadFile(filepath.Join(clipinfDir, entry.Name()))
		if err != nil {
			lastErr = err
			continue
		}
		codecs, err := parseBlurayClipInfoCodecs(data)
		if err != nil {
			lastErr = err
			continue
		}
		mergeSourceCodecs(merged, codecs)
		parsed++
	}
	if parsed == 0 {
		if lastErr != nil {
			return nil, fmt.Errorf("failed to parse any CLPI file: %w", lastErr)
		}
		return nil, fmt.Errorf("no valid CLPI files found")
	}
	return merged, nil
}
package source
import (
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"strings"

	"github.com/stuckj/mkvdup/internal/mkv"
)
// CodecType represents a broad codec family.
//
// Values are assigned by iota, so declaration order below is significant:
// append new codecs at the end rather than inserting, to keep existing
// values stable.
type CodecType int

// Codec type constants.
const (
	CodecUnknown CodecType = iota // unrecognized or undeterminable codec

	// Video codec families.
	CodecMPEG1Video
	CodecMPEG2Video
	CodecH264Video
	CodecH265Video
	CodecVC1Video

	// Audio codec families.
	CodecAC3Audio
	CodecEAC3Audio
	CodecDTSAudio
	CodecDTSHDAudio
	CodecTrueHDAudio
	CodecLPCMAudio
	CodecMPEGAudio
	CodecAACaudio // NOTE(review): casing is inconsistent with the other *Audio names, but it is exported — kept for compatibility
	CodecFLACAudio
	CodecOpusAudio

	// Subtitle codec families.
	CodecPGSSubtitle
)
// CodecTypeName returns a human-readable name for a codec type.
// Unrecognized values (including CodecUnknown) map to "Unknown".
func CodecTypeName(ct CodecType) string {
	names := map[CodecType]string{
		CodecMPEG1Video:  "MPEG-1",
		CodecMPEG2Video:  "MPEG-2",
		CodecH264Video:   "H.264",
		CodecH265Video:   "H.265",
		CodecVC1Video:    "VC-1",
		CodecAC3Audio:    "AC3",
		CodecEAC3Audio:   "E-AC3",
		CodecDTSAudio:    "DTS",
		CodecDTSHDAudio:  "DTS-HD",
		CodecTrueHDAudio: "TrueHD",
		CodecLPCMAudio:   "LPCM",
		CodecMPEGAudio:   "MPEG Audio",
		CodecAACaudio:    "AAC",
		CodecFLACAudio:   "FLAC",
		CodecOpusAudio:   "Opus",
		CodecPGSSubtitle: "PGS",
	}
	if name, ok := names[ct]; ok {
		return name
	}
	return "Unknown"
}
// IsVideoCodec returns true if the codec type is a video codec.
func IsVideoCodec(ct CodecType) bool {
	return ct == CodecMPEG1Video || ct == CodecMPEG2Video || ct == CodecH264Video ||
		ct == CodecH265Video || ct == CodecVC1Video
}
// IsSubtitleCodec returns true if the codec type is a subtitle codec.
// PGS is currently the only subtitle family tracked.
func IsSubtitleCodec(ct CodecType) bool {
	switch ct {
	case CodecPGSSubtitle:
		return true
	default:
		return false
	}
}
// IsAudioCodec returns true if the codec type is an audio codec.
func IsAudioCodec(ct CodecType) bool {
	return ct == CodecAC3Audio || ct == CodecEAC3Audio || ct == CodecDTSAudio ||
		ct == CodecDTSHDAudio || ct == CodecTrueHDAudio || ct == CodecLPCMAudio ||
		ct == CodecMPEGAudio || ct == CodecAACaudio || ct == CodecFLACAudio ||
		ct == CodecOpusAudio
}
// MKVCodecToType maps an MKV CodecID string to a CodecType.
// Exact IDs are resolved first, then prefix families (DTS-HD variants,
// PCM layouts, MPEG audio layers, AAC profiles). Anything else — including
// "V_MS/VFW/FOURCC", which could be VC-1 or another codec and cannot be
// determined without codec private data — maps to CodecUnknown.
func MKVCodecToType(codecID string) CodecType {
	exact := map[string]CodecType{
		"V_MPEG1":          CodecMPEG1Video,
		"V_MPEG2":          CodecMPEG2Video,
		"V_MPEG4/ISO/AVC":  CodecH264Video,
		"V_MPEGH/ISO/HEVC": CodecH265Video,
		"A_AC3":            CodecAC3Audio,
		"A_EAC3":           CodecEAC3Audio,
		"A_DTS":            CodecDTSAudio,
		"A_TRUEHD":         CodecTrueHDAudio,
		"A_FLAC":           CodecFLACAudio,
		"A_OPUS":           CodecOpusAudio,
		"S_HDMV/PGS":       CodecPGSSubtitle,
	}
	if ct, ok := exact[codecID]; ok {
		return ct
	}
	switch {
	case strings.HasPrefix(codecID, "A_DTS/"):
		// A_DTS/EXPRESS, A_DTS/LOSSLESS, etc.
		return CodecDTSHDAudio
	case strings.HasPrefix(codecID, "A_PCM/"):
		// A_PCM/INT/LIT, A_PCM/INT/BIG, A_PCM/FLOAT/IEEE
		return CodecLPCMAudio
	case strings.HasPrefix(codecID, "A_MPEG/"):
		// A_MPEG/L2, A_MPEG/L3
		return CodecMPEGAudio
	case strings.HasPrefix(codecID, "A_AAC"):
		// A_AAC, A_AAC/MPEG2/MAIN, etc.
		return CodecAACaudio
	}
	return CodecUnknown
}
// SourceCodecs describes the codecs found in a source media.
// Each slice holds the deduplicated set of codec families detected for that
// track class. An empty slice means "no information available" rather than
// "no tracks" — CheckCodecCompatibility skips comparison for empty slices.
type SourceCodecs struct {
	VideoCodecs    []CodecType
	AudioCodecs    []CodecType
	SubtitleCodecs []CodecType
}
// CodecMismatch describes a detected codec mismatch between MKV and source.
type CodecMismatch struct {
	TrackType    string      // "video", "audio", or "subtitle" (as emitted by CheckCodecCompatibility)
	MKVCodecID   string      // e.g. "V_MPEG4/ISO/AVC"
	MKVCodecType CodecType   // resolved codec type
	SourceCodecs []CodecType // codecs found in source for this track type
}
// DetectSourceCodecs determines what codecs are present in the source media.
// For DVD sources, it extracts codec info from the already-parsed MPEG-PS data.
// For Blu-ray sources, it performs a lightweight PMT scan of the first M2TS file.
func DetectSourceCodecs(index *Index) (*SourceCodecs, error) {
	if index.SourceType == TypeDVD {
		return detectDVDCodecs(index)
	}
	if index.SourceType == TypeBluray {
		return detectBlurayCodecs(index)
	}
	return nil, fmt.Errorf("unknown source type")
}
// DetectSourceCodecsFromDir performs a lightweight codec detection from a source
// directory without building the full hash index. This allows codec compatibility
// checks to run before the expensive indexing step.
//
// For Blu-ray sources, this scans the PMTs of all M2TS files of significant size
// (>10% of the largest) and unions their codecs. This is necessary because
// different episodes or playlist entries may reference different M2TS files with
// different audio tracks (e.g., a stereo AC3 track may only appear in certain
// episode M2TS files, not in the largest one).
func DetectSourceCodecsFromDir(sourceDir string) (*SourceCodecs, error) {
	sourceType, err := DetectType(sourceDir)
	if err != nil {
		return nil, fmt.Errorf("detect source type: %w", err)
	}
	files, err := EnumerateMediaFiles(sourceDir, sourceType)
	if err != nil {
		return nil, fmt.Errorf("enumerate files: %w", err)
	}
	if len(files) == 0 {
		return nil, fmt.Errorf("no media files found in %s", sourceDir)
	}
	// Stat every enumerated file; unreadable entries are silently dropped.
	type statEntry struct {
		relPath string
		size    int64
	}
	entries := make([]statEntry, 0, len(files))
	var largest int64
	for _, relPath := range files {
		info, statErr := os.Stat(filepath.Join(sourceDir, relPath))
		if statErr != nil {
			continue
		}
		entries = append(entries, statEntry{relPath: relPath, size: info.Size()})
		if sz := info.Size(); sz > largest {
			largest = sz
		}
	}
	if len(entries) == 0 {
		return nil, fmt.Errorf("no accessible media files found")
	}
	switch sourceType {
	case TypeBluray:
		// Prefer CLPI metadata for extracted Blu-ray directories.
		if codecs, clpiErr := detectBlurayCodecsFromCLPIDir(sourceDir); clpiErr == nil {
			return codecs, nil
		}
		// Fallback: scan PMTs from the M2TS data itself.
		targets := make([]codecScanTarget, 0, len(entries))
		for _, e := range entries {
			targets = append(targets, codecScanTarget{
				Path: filepath.Join(sourceDir, e.relPath),
				Size: e.size,
			})
		}
		return detectBlurayCodecsMulti(significantTargets(targets))
	case TypeDVD:
		// For DVDs, probe the largest file (the main feature).
		var mainFeature string
		for _, e := range entries {
			if e.size == largest {
				mainFeature = e.relPath
				break
			}
		}
		return detectDVDCodecsFromFile(filepath.Join(sourceDir, mainFeature))
	default:
		return nil, fmt.Errorf("unknown source type")
	}
}
// CheckCodecCompatibility compares MKV track codecs against source codecs.
// Returns nil if all codecs are compatible, or a list of mismatches.
// Tracks with unknown codec IDs are skipped (no false alarms), as are track
// classes for which the source provides no codec information.
func CheckCodecCompatibility(tracks []mkv.Track, sourceCodecs *SourceCodecs) []CodecMismatch {
	var mismatches []CodecMismatch
	for _, track := range tracks {
		ct := MKVCodecToType(track.CodecID)
		if ct == CodecUnknown {
			continue // Skip unknown codecs — no false alarms
		}
		// Resolve the track to its class label and the matching source list.
		var label string
		var pool []CodecType
		switch {
		case track.Type == mkv.TrackTypeVideo && IsVideoCodec(ct):
			label, pool = "video", sourceCodecs.VideoCodecs
		case track.Type == mkv.TrackTypeAudio && IsAudioCodec(ct):
			label, pool = "audio", sourceCodecs.AudioCodecs
		case track.Type == mkv.TrackTypeSubtitle && IsSubtitleCodec(ct):
			label, pool = "subtitle", sourceCodecs.SubtitleCodecs
		default:
			continue
		}
		if len(pool) == 0 {
			continue // No source info available for this track class
		}
		if codecFamilyMatch(ct, pool) {
			continue
		}
		mismatches = append(mismatches, CodecMismatch{
			TrackType:    label,
			MKVCodecID:   track.CodecID,
			MKVCodecType: ct,
			SourceCodecs: pool,
		})
	}
	return mismatches
}
// codecFamilyMatch checks if a codec type is compatible with any codec in the list.
// Uses family-based matching (e.g., DTS is compatible with DTS-HD).
func codecFamilyMatch(ct CodecType, sourceCodecs []CodecType) bool {
	want := codecFamily(ct)
	for i := range sourceCodecs {
		if codecFamily(sourceCodecs[i]) == want {
			return true
		}
	}
	return false
}
// codecFamily returns the codec family for family-based matching.
// Related codecs map to the same family value (AC3/E-AC3, DTS/DTS-HD,
// MPEG-1/MPEG-2 video); unrecognized codecs fall into family 0.
func codecFamily(ct CodecType) int {
	families := map[CodecType]int{
		CodecMPEG1Video: 1, CodecMPEG2Video: 1,
		CodecH264Video: 2,
		CodecH265Video: 3,
		CodecVC1Video:  4,
		CodecAC3Audio: 10, CodecEAC3Audio: 10,
		CodecDTSAudio: 11, CodecDTSHDAudio: 11,
		CodecTrueHDAudio: 12,
		CodecLPCMAudio:   13,
		CodecMPEGAudio:   14,
		CodecAACaudio:    15,
		CodecFLACAudio:   16,
		CodecOpusAudio:   17,
		CodecPGSSubtitle: 20,
	}
	return families[ct] // missing keys yield 0, the "unknown" family
}
// containsCodec reports whether ct is already present in codecs.
// Uses slices.Contains (Go 1.21+) instead of a hand-rolled linear scan;
// the module already relies on Go 1.21 builtins (min) elsewhere.
func containsCodec(codecs []CodecType, ct CodecType) bool {
	return slices.Contains(codecs, ct)
}
// codecScanTarget describes a file to scan for codec detection.
// Unlike isoFileExtent (which represents an ISO directory entry with an
// uppercase ISO filename), this is used for on-disk paths that may be
// M2TS files, ISOs, or other media files.
// Size is carried separately so significantTargets can filter by relative
// size without re-statting files.
type codecScanTarget struct {
	Path string // filesystem path
	Size int64  // file size in bytes
}
// significantFiles returns the subset of ISO file extents whose size is at
// least 10% of the largest. Used for filtering M2TS/VOB entries within ISOs.
func significantFiles(files []isoFileExtent) []isoFileExtent {
	var largest int64
	for i := range files {
		if files[i].Size > largest {
			largest = files[i].Size
		}
	}
	threshold := largest / 10
	var kept []isoFileExtent
	for _, f := range files {
		if f.Size >= threshold {
			kept = append(kept, f)
		}
	}
	return kept
}
// significantTargets returns the subset of scan targets whose size is at
// least 10% of the largest. Used for filtering on-disk files for codec detection.
func significantTargets(targets []codecScanTarget) []codecScanTarget {
	var largest int64
	for i := range targets {
		if targets[i].Size > largest {
			largest = targets[i].Size
		}
	}
	threshold := largest / 10
	var kept []codecScanTarget
	for _, t := range targets {
		if t.Size >= threshold {
			kept = append(kept, t)
		}
	}
	return kept
}
// mergeSourceCodecs adds all codecs from src into dst, deduplicating.
func mergeSourceCodecs(dst, src *SourceCodecs) {
	// appendMissing extends have with every codec from add not already present.
	appendMissing := func(have, add []CodecType) []CodecType {
		for _, c := range add {
			if !containsCodec(have, c) {
				have = append(have, c)
			}
		}
		return have
	}
	dst.VideoCodecs = appendMissing(dst.VideoCodecs, src.VideoCodecs)
	dst.AudioCodecs = appendMissing(dst.AudioCodecs, src.AudioCodecs)
	dst.SubtitleCodecs = appendMissing(dst.SubtitleCodecs, src.SubtitleCodecs)
}
package source
import (
"encoding/binary"
"fmt"
"os"
"strings"
)
// parseDVDIFOCodecs parses a VTS_xx_0.IFO file's VTS_MAT structure to extract
// video and audio codec information. The IFO file authoritatively declares
// every stream in the title set, unlike PES scanning which can miss streams
// that appear later in the VOB data.
//
// VTS_MAT layout (relevant offsets):
//
//	0x000-0x00B: "DVDVIDEO-VTS" identifier
//	0x200-0x201: VTS video attributes (2 bytes)
//	0x202-0x203: Number of VTS audio streams (2 bytes, big-endian)
//	0x204-0x243: VTS audio stream attributes (8 bytes each, max 8)
func parseDVDIFOCodecs(data []byte) (*SourceCodecs, error) {
	const (
		minIFOSize    = 0x244
		videoAttrOff  = 0x200
		audioCountOff = 0x202
		audioAttrOff  = 0x204
		audioAttrSize = 8
		maxAudioSlots = 8
	)
	if len(data) < minIFOSize {
		return nil, fmt.Errorf("IFO data too short (%d bytes)", len(data))
	}
	if magic := string(data[0:12]); magic != "DVDVIDEO-VTS" {
		return nil, fmt.Errorf("not a VTS IFO file (magic: %q)", magic)
	}
	codecs := &SourceCodecs{}
	// Video attributes: bits 15-14 hold the compression mode
	// (0=MPEG-1, 1=MPEG-2); other values are left unreported.
	videoAttr := binary.BigEndian.Uint16(data[videoAttrOff : videoAttrOff+2])
	switch (videoAttr >> 14) & 0x03 {
	case 0:
		codecs.VideoCodecs = append(codecs.VideoCodecs, CodecMPEG1Video)
	case 1:
		codecs.VideoCodecs = append(codecs.VideoCodecs, CodecMPEG2Video)
	}
	// Audio stream count, clamped to the 8 attribute slots.
	numAudio := min(int(binary.BigEndian.Uint16(data[audioCountOff:audioCountOff+2])), maxAudioSlots)
	for i := 0; i < numAudio; i++ {
		attr := data[audioAttrOff+i*audioAttrSize:]
		// Skip all-zero entries (unused slots).
		if attr[0] == 0 && attr[1] == 0 {
			continue
		}
		// Byte 0, bits 7-5: audio coding mode.
		var ct CodecType
		switch (attr[0] >> 5) & 0x07 {
		case 0:
			ct = CodecAC3Audio
		case 2, 3:
			ct = CodecMPEGAudio // MPEG-1 and MPEG-2ext
		case 4:
			ct = CodecLPCMAudio
		case 6:
			ct = CodecDTSAudio
		default:
			continue
		}
		if !containsCodec(codecs.AudioCodecs, ct) {
			codecs.AudioCodecs = append(codecs.AudioCodecs, ct)
		}
	}
	return codecs, nil
}
// findIFOsInISO navigates an ISO9660 filesystem to find VTS IFO files
// (VTS_xx_0.IFO) under the VIDEO_TS directory. Returns nil if navigation fails.
func findIFOsInISO(f *os.File) []isoFileExtent {
	rootLBA, rootLen, err := readISOPVDRoot(f)
	if err != nil {
		return nil
	}
	rootEntries, err := readISODirectory(f, rootLBA, rootLen)
	if err != nil {
		return nil
	}
	videoTS, err := findISOEntry(rootEntries, "VIDEO_TS")
	if err != nil {
		return nil
	}
	entries, err := readISODirectory(f, uint32(videoTS.Offset/isoSectorSize), uint32(videoTS.Size))
	if err != nil {
		return nil
	}
	var ifos []isoFileExtent
	for _, entry := range entries {
		if entry.IsDir {
			continue
		}
		// Match the VTS_xx_0.IFO pattern (e.g., VTS_01_0.IFO):
		// exactly 12 chars, title-set number followed by "_0", .IFO extension.
		name := entry.Name
		if len(name) == 12 && name[7] == '0' &&
			strings.HasPrefix(name, "VTS_") && strings.HasSuffix(name, ".IFO") {
			ifos = append(ifos, entry)
		}
	}
	return ifos
}
// findIFOsInUDF navigates a UDF filesystem to find VTS IFO files under VIDEO_TS.
// Entries that cannot be resolved (unreadable file entry, zero length, no
// extents) are skipped; an error is returned only when navigation fails or no
// IFO survives the filter.
func findIFOsInUDF(f *os.File) ([]isoFileExtent, error) {
	ctx, err := newUDFContext(f)
	if err != nil {
		return nil, err
	}
	rootFIDs, err := ctx.readDirectoryFromFE(ctx.rootFE)
	if err != nil {
		return nil, fmt.Errorf("read UDF root directory: %w", err)
	}
	vtsFE, err := ctx.lookupDir(rootFIDs, "VIDEO_TS")
	if err != nil {
		return nil, fmt.Errorf("find VIDEO_TS: %w", err)
	}
	vtsFIDs, err := ctx.readDirectoryFromFE(vtsFE)
	if err != nil {
		return nil, fmt.Errorf("read VIDEO_TS directory: %w", err)
	}
	var ifos []isoFileExtent
	for _, fid := range vtsFIDs {
		if fid.IsDir || fid.IsParent {
			continue
		}
		// Match VTS_xx_0.IFO (title-set IFO), case-insensitively.
		name := strings.ToUpper(fid.Name)
		if len(name) != 12 || name[7] != '0' ||
			!strings.HasPrefix(name, "VTS_") || !strings.HasSuffix(name, ".IFO") {
			continue
		}
		fe, feErr := ctx.readFileEntryAt(fid.ICBLocation)
		if feErr != nil || fe.InfoLength == 0 {
			continue
		}
		extents, exErr := ctx.resolveAllExtents(fe)
		if exErr != nil || len(extents) == 0 {
			continue
		}
		entry := isoFileExtent{
			Name:   name,
			Offset: extents[0].ISOOffset,
			Size:   int64(fe.InfoLength),
			IsDir:  false,
		}
		// Only record the extent list when the file is fragmented.
		if !extentsContiguous(extents) {
			entry.Extents = extents
		}
		ifos = append(ifos, entry)
	}
	if len(ifos) == 0 {
		return nil, fmt.Errorf("no VTS IFO files found in UDF VIDEO_TS/")
	}
	return ifos, nil
}
// detectDVDCodecsFromIFOs reads IFO files from within an ISO and returns
// the unioned codec information from all title sets. An error is returned
// only when no IFO could be read and parsed.
func detectDVDCodecsFromIFOs(f *os.File, ifos []isoFileExtent) (*SourceCodecs, error) {
	// VTS_MAT parsing only needs the first 0x244 bytes of each IFO.
	const maxIFOReadSize int64 = 0x244

	union := &SourceCodecs{}
	parsed := 0
	var lastErr error
	for _, entry := range ifos {
		raw, readErr := readISOFileExtent(f, entry, maxIFOReadSize)
		if readErr != nil {
			lastErr = readErr
			continue
		}
		info, parseErr := parseDVDIFOCodecs(raw)
		if parseErr != nil {
			lastErr = parseErr
			continue
		}
		mergeSourceCodecs(union, info)
		parsed++
	}
	if parsed == 0 {
		if lastErr != nil {
			return nil, fmt.Errorf("failed to parse any VTS IFO: %w", lastErr)
		}
		return nil, fmt.Errorf("no valid VTS IFO files found")
	}
	return union, nil
}
package source
import "fmt"
// binarySearchRanges performs binary search on PES payload ranges to find the one
// containing the given ES offset. Returns the index, or -1 if not found
// (including for an empty range list).
func binarySearchRanges(ranges []PESPayloadRange, esOffset int64) int {
	lo, hi := 0, len(ranges)-1
	for lo <= hi {
		mid := lo + (hi-lo)/2 // overflow-safe midpoint
		start := ranges[mid].ESOffset
		end := start + int64(ranges[mid].Size)
		switch {
		case esOffset < start:
			hi = mid - 1
		case esOffset >= end:
			lo = mid + 1
		default:
			return mid
		}
	}
	return -1
}
// readByteAt reads a single byte at the given file offset, using the
// multi-region view when present, otherwise the flat data slice.
func readByteAt(data []byte, mr *multiRegionData, fileOffset int64) byte {
	if mr == nil {
		return data[fileOffset]
	}
	return mr.ByteAt(fileOffset)
}
// readByteWithHint reads a single byte from a set of PES payload ranges using a hint
// for O(1) sequential access. Returns the byte, the range index for the next hint,
// and success status. Pass rangeHint=-1 to force binary search.
// When mr is non-nil, byte reads use the multi-region data instead of data.
//
// Hot path: callers read ES offsets mostly sequentially, so the hinted range
// and its immediate neighbors are checked before falling back to a binary
// search over all ranges. The hint/next/prev check order below is the
// optimization — do not reorder.
func readByteWithHint(data []byte, mr *multiRegionData, dataSize int64, ranges []PESPayloadRange, esOffset int64, rangeHint int) (byte, int, bool) {
	if len(ranges) == 0 {
		return 0, -1, false
	}
	// Fast path: check if hint is still valid (O(1) check)
	if rangeHint >= 0 && rangeHint < len(ranges) {
		r := ranges[rangeHint]
		if esOffset >= r.ESOffset && esOffset < r.ESOffset+int64(r.Size) {
			offsetInPayload := esOffset - r.ESOffset
			fileOffset := r.FileOffset + offsetInPayload
			// Bounds check against the underlying file before dereferencing.
			if fileOffset >= 0 && fileOffset < dataSize {
				return readByteAt(data, mr, fileOffset), rangeHint, true
			}
		}
		// Check next range (common case when crossing boundaries forward)
		if rangeHint+1 < len(ranges) {
			r = ranges[rangeHint+1]
			if esOffset >= r.ESOffset && esOffset < r.ESOffset+int64(r.Size) {
				offsetInPayload := esOffset - r.ESOffset
				fileOffset := r.FileOffset + offsetInPayload
				if fileOffset >= 0 && fileOffset < dataSize {
					return readByteAt(data, mr, fileOffset), rangeHint + 1, true
				}
			}
		}
		// Check previous range (common case when crossing boundaries backward)
		if rangeHint-1 >= 0 {
			r = ranges[rangeHint-1]
			if esOffset >= r.ESOffset && esOffset < r.ESOffset+int64(r.Size) {
				offsetInPayload := esOffset - r.ESOffset
				fileOffset := r.FileOffset + offsetInPayload
				if fileOffset >= 0 && fileOffset < dataSize {
					return readByteAt(data, mr, fileOffset), rangeHint - 1, true
				}
			}
		}
	}
	// Slow path: binary search
	rangeIdx := binarySearchRanges(ranges, esOffset)
	if rangeIdx < 0 {
		return 0, -1, false
	}
	r := ranges[rangeIdx]
	offsetInPayload := esOffset - r.ESOffset
	fileOffset := r.FileOffset + offsetInPayload
	if fileOffset >= 0 && fileOffset < dataSize {
		return readByteAt(data, mr, fileOffset), rangeIdx, true
	}
	return 0, -1, false
}
// readSliceAt reads a byte slice for the file offset range [fileOffset, endOffset),
// using the multi-region view when present, otherwise the flat data slice.
func readSliceAt(data []byte, mr *multiRegionData, fileOffset, endOffset int64) []byte {
	if mr == nil {
		return data[fileOffset:endOffset]
	}
	return mr.Slice(fileOffset, endOffset)
}
// readFromRanges reads data from PES payload ranges starting at the given ES offset.
// Returns a zero-copy slice when data fits in a single range (common case),
// only copies when data spans multiple ranges.
// When mr is non-nil, data reads use the multi-region data instead of data.
//
// NOTE(review): the multi-range path is best-effort — if a mapped file offset
// falls past dataSize after some bytes were already copied, or the range list
// is exhausted early, the partial result is returned with a nil error.
// Callers must not assume len(result) == size.
func readFromRanges(data []byte, mr *multiRegionData, dataSize int64, ranges []PESPayloadRange, esOffset int64, size int) ([]byte, error) {
	if len(ranges) == 0 {
		return nil, fmt.Errorf("no ranges available")
	}
	// Use binary search to find starting range
	rangeIdx := binarySearchRanges(ranges, esOffset)
	if rangeIdx < 0 {
		// Fallback: linear scan forward past ranges that end at or before
		// esOffset (covers offsets sitting in a gap between ranges).
		rangeIdx = 0
		for rangeIdx < len(ranges) && esOffset >= ranges[rangeIdx].ESOffset+int64(ranges[rangeIdx].Size) {
			rangeIdx++
		}
	}
	if rangeIdx >= len(ranges) {
		return nil, fmt.Errorf("ES offset %d not found in ranges", esOffset)
	}
	r := ranges[rangeIdx]
	if esOffset < r.ESOffset || esOffset >= r.ESOffset+int64(r.Size) {
		return nil, fmt.Errorf("ES offset %d not in range [%d, %d)", esOffset, r.ESOffset, r.ESOffset+int64(r.Size))
	}
	offsetInPayload := esOffset - r.ESOffset
	availableInRange := int64(r.Size) - offsetInPayload
	// Fast path: data fits entirely within this single range (zero-copy)
	if int64(size) <= availableInRange {
		fileOffset := r.FileOffset + offsetInPayload
		endOffset := fileOffset + int64(size)
		if endOffset > dataSize {
			return nil, fmt.Errorf("file offset out of range")
		}
		return readSliceAt(data, mr, fileOffset, endOffset), nil
	}
	// Slow path: data spans multiple ranges — must copy
	result := make([]byte, 0, size)
	remaining := size
	for remaining > 0 && rangeIdx < len(ranges) {
		r := ranges[rangeIdx]
		if esOffset < r.ESOffset {
			// Gap in ES coverage — stop with whatever has been copied.
			break
		}
		if esOffset >= r.ESOffset+int64(r.Size) {
			rangeIdx++
			continue
		}
		offsetInPayload := esOffset - r.ESOffset
		availableInRange := int64(r.Size) - offsetInPayload
		toRead := remaining
		if int64(toRead) > availableInRange {
			toRead = int(availableInRange)
		}
		fileOffset := r.FileOffset + offsetInPayload
		endOffset := fileOffset + int64(toRead)
		if endOffset > dataSize {
			// Out-of-bounds mapping: return the partial prefix if any bytes
			// were already copied, otherwise report the error.
			if len(result) > 0 {
				return result, nil
			}
			return nil, fmt.Errorf("failed to read ES data: offset out of range")
		}
		result = append(result, readSliceAt(data, mr, fileOffset, endOffset)...)
		esOffset += int64(toRead)
		remaining -= toRead
		rangeIdx++
	}
	return result, nil
}
// rawRangesFromPESRanges enumerates raw file ranges for a given ES region.
// Unlike readFromRanges, it fails with an error if the whole [esOffset,
// esOffset+size) region cannot be mapped onto the range list.
func rawRangesFromPESRanges(ranges []PESPayloadRange, esOffset int64, size int) ([]RawRange, error) {
	if len(ranges) == 0 {
		return nil, fmt.Errorf("no ranges available")
	}
	// Use binary search to find the starting range.
	idx := binarySearchRanges(ranges, esOffset)
	if idx < 0 {
		// Linear fallback: skip past every range ending at or before esOffset.
		idx = 0
		for idx < len(ranges) && esOffset >= ranges[idx].ESOffset+int64(ranges[idx].Size) {
			idx++
		}
	}
	if idx >= len(ranges) {
		return nil, fmt.Errorf("ES offset %d not found in ranges", esOffset)
	}
	if first := ranges[idx]; esOffset < first.ESOffset || esOffset >= first.ESOffset+int64(first.Size) {
		return nil, fmt.Errorf("ES offset %d not in range [%d, %d)", esOffset, first.ESOffset, first.ESOffset+int64(first.Size))
	}
	var result []RawRange
	remaining := size
	for remaining > 0 && idx < len(ranges) {
		cur := ranges[idx]
		if esOffset < cur.ESOffset {
			break // gap in ES coverage
		}
		if esOffset >= cur.ESOffset+int64(cur.Size) {
			idx++
			continue
		}
		within := esOffset - cur.ESOffset
		avail := int64(cur.Size) - within
		take := remaining
		if int64(take) > avail {
			take = int(avail)
		}
		result = append(result, RawRange{
			FileOffset: cur.FileOffset + within,
			Size:       take,
		})
		esOffset += int64(take)
		remaining -= take
		idx++
	}
	if remaining > 0 {
		return nil, fmt.Errorf("could not map entire ES region: %d bytes remaining", remaining)
	}
	return result, nil
}
// totalESSizeFromRanges returns the total ES size from a range list:
// the end offset of the final range (ranges are ordered by ESOffset).
func totalESSizeFromRanges(ranges []PESPayloadRange) int64 {
	n := len(ranges)
	if n == 0 {
		return 0
	}
	return ranges[n-1].ESOffset + int64(ranges[n-1].Size)
}
package source
import (
	"errors"
	"fmt"

	"github.com/cespare/xxhash/v2"
	"golang.org/x/sys/unix"
)
// Lookup finds locations in the source that match the given hash.
// Returns nil when the hash has no entries (map miss yields the zero slice).
func (idx *Index) Lookup(hash uint64) []Location {
	return idx.HashToLocations[hash]
}
// ReadESDataAt reads ES data at the given location.
// For sources that use ES offsets, this handles the translation.
// For audio locations, uses the sub-stream ID from the location.
func (idx *Index) ReadESDataAt(loc Location, size int) ([]byte, error) {
	fi := int(loc.FileIndex)
	if fi >= len(idx.ESReaders) || idx.ESReaders[fi] == nil {
		// No ES reader - this shouldn't happen for ES-based indexes
		return nil, fmt.Errorf("no ES reader for file %d", loc.FileIndex)
	}
	reader := idx.ESReaders[fi]
	if !loc.IsVideo {
		// Audio reads go through the sub-stream specific reader.
		return reader.ReadAudioSubStreamData(loc.AudioSubStreamID, loc.Offset, size)
	}
	return reader.ReadESData(loc.Offset, size, true)
}
// hintedESReader is the interface for hint-based byte reading.
// Both MPEGPSParser and MPEGTSParser implement this.
// The returned int is the range hint to feed into the next call;
// passing -1 forces a fresh binary search.
type hintedESReader interface {
	ReadESByteWithHint(esOffset int64, isVideo bool, rangeHint int) (byte, int, bool)
	ReadAudioByteWithHint(subStreamID byte, esOffset int64, rangeHint int) (byte, int, bool)
}
// ReadESByteWithHint reads a single byte from the ES stream, using a range hint
// to avoid binary search when reading sequentially. Returns the byte, the new range
// hint for the next call, and success status. Pass rangeHint=-1 to force binary search.
// This is optimized for the expandMatch hot path where we read bytes sequentially.
func (idx *Index) ReadESByteWithHint(loc Location, rangeHint int) (byte, int, bool) {
	fi := int(loc.FileIndex)
	if fi >= len(idx.ESReaders) || idx.ESReaders[fi] == nil {
		return 0, -1, false
	}
	reader := idx.ESReaders[fi]
	// Fast path: hint-aware readers (MPEGPSParser and MPEGTSParser).
	if hinted, ok := reader.(hintedESReader); ok {
		if loc.IsVideo {
			return hinted.ReadESByteWithHint(loc.Offset, true, rangeHint)
		}
		return hinted.ReadAudioByteWithHint(loc.AudioSubStreamID, loc.Offset, rangeHint)
	}
	// Fallback for any other ESReader: a 1-byte ReadESData (allocates).
	var (
		data []byte
		err  error
	)
	if loc.IsVideo {
		data, err = reader.ReadESData(loc.Offset, 1, true)
	} else {
		data, err = reader.ReadAudioSubStreamData(loc.AudioSubStreamID, loc.Offset, 1)
	}
	if err != nil || len(data) == 0 {
		return 0, -1, false
	}
	return data[0], -1, true
}
// ComputeHash calculates the xxhash (64-bit) of the given data.
func ComputeHash(data []byte) uint64 {
	return xxhash.Sum64(data)
}
// AdviseForMatching sets madvise hints on source mmap'd files before matching.
// For raw-indexed sources (Blu-ray with raw offsets), sets MADV_SEQUENTIAL since
// locality-aware matching produces largely sequential access.
// For ES-indexed sources (DVD MPEG-PS, Blu-ray M2TS with ES offsets), the ES reader
// translates ES offsets to scattered positions in the container file, so MADV_SEQUENTIAL
// would hurt. Uses MADV_NORMAL (default adaptive readahead) instead.
func (idx *Index) AdviseForMatching() {
	if !idx.UsesESOffsets {
		// Raw-indexed: locality-aware matching reads the file largely in order.
		for _, reader := range idx.RawReaders {
			if rr, ok := reader.(*mmapRawReader); ok {
				rr.mmapFile.Advise(unix.MADV_SEQUENTIAL)
			}
		}
		return
	}
	// ES-indexed: ES offsets map to scattered PES packets in the container,
	// so default adaptive readahead beats a sequential hint.
	for _, mmapFile := range idx.MmapFiles {
		if mmapFile != nil {
			mmapFile.Advise(unix.MADV_NORMAL)
		}
	}
}
// Close releases resources held by the index: first the mmap files backing
// the ESReaders and RawReaders, then the raw readers themselves (which also
// close their own mmap files).
//
// Previously all close errors were silently discarded and nil was always
// returned; they are now collected and combined with errors.Join so callers
// can see cleanup failures. All resources are still closed even if earlier
// closes fail.
func (idx *Index) Close() error {
	var errs []error
	// Close all mmap files (these back the ESReaders and RawReaders)
	for _, mmapFile := range idx.MmapFiles {
		if mmapFile != nil {
			if err := mmapFile.Close(); err != nil {
				errs = append(errs, err)
			}
		}
	}
	// Close all raw readers (which also close their mmap files)
	for _, reader := range idx.RawReaders {
		if reader != nil {
			if err := reader.Close(); err != nil {
				errs = append(errs, err)
			}
		}
	}
	// errors.Join returns nil when errs is empty.
	return errors.Join(errs...)
}
package source
import (
"fmt"
"io"
"path/filepath"
"strings"
"github.com/cespare/xxhash/v2"
"github.com/stuckj/mkvdup/internal/mmap"
)
// Hash window bounds. NewIndexerWithOptions clamps requested window sizes
// into [MinWindowSize, MaxWindowSize].
const (
	// DefaultWindowSize is the default number of bytes to hash at each sync point
	DefaultWindowSize = 64
	// MinWindowSize is the minimum allowed window size
	MinWindowSize = 32
	// MaxWindowSize is the maximum allowed window size
	MaxWindowSize = 4096
)
// Indexer builds a hash index from source media files.
type Indexer struct {
	sourceDir      string    // root directory of the source media
	sourceType     Type      // source type detected by DetectType (DVD, Blu-ray, ...)
	windowSize     int       // bytes hashed at each sync point, clamped to [MinWindowSize, MaxWindowSize]
	index          *Index    // the index under construction
	useRawIndexing bool      // Force raw file indexing even for DVDs
	verboseWriter  io.Writer // Destination for diagnostic output (nil = disabled)
}
// NewIndexer creates a new Indexer for the given source directory using the
// default options (raw indexing disabled; see NewIndexerWithOptions).
func NewIndexer(sourceDir string, windowSize int) (*Indexer, error) {
	return NewIndexerWithOptions(sourceDir, windowSize, false)
}
// NewIndexerWithOptions creates a new Indexer with additional options.
// useRawIndexing forces raw file indexing even for DVDs (useful for finding
// content from any title/stream in the ISO). Out-of-bounds window sizes are
// silently clamped into [MinWindowSize, MaxWindowSize].
func NewIndexerWithOptions(sourceDir string, windowSize int, useRawIndexing bool) (*Indexer, error) {
	sourceType, err := DetectType(sourceDir)
	if err != nil {
		return nil, fmt.Errorf("detect source type: %w", err)
	}
	// Clamp the hash window into the supported range.
	windowSize = min(max(windowSize, MinWindowSize), MaxWindowSize)
	return &Indexer{
		sourceDir:      sourceDir,
		sourceType:     sourceType,
		windowSize:     windowSize,
		index:          NewIndex(sourceDir, sourceType, windowSize),
		useRawIndexing: useRawIndexing,
	}, nil
}
// SourceType returns the detected source type.
func (idx *Indexer) SourceType() Type {
	return idx.sourceType
}
// SetVerboseWriter sets the destination for diagnostic output during indexing.
// Pass nil to disable verbose output.
func (idx *Indexer) SetVerboseWriter(w io.Writer) {
	idx.verboseWriter = w
}
// SourceDir returns the source directory path.
func (idx *Indexer) SourceDir() string {
	return idx.sourceDir
}
// ProgressFunc is called during indexing to report progress.
// processed is the number of bytes processed so far, total is the total bytes to process.
// Implementations should be fast; it may be invoked frequently.
type ProgressFunc func(processed, total int64)
// Build scans all media files and builds the hash index.
// If progress is non-nil, it will be called periodically to report progress.
//
// Indexing strategy by source type:
//   - DVD (unless raw indexing forced): MPEG-PS ES-based indexing
//   - Blu-ray ISO files: per-M2TS-region indexing (may add several file entries)
//   - Blu-ray directories: M2TS ES-based indexing
//   - anything else: raw file indexing
func (idx *Indexer) Build(progress ProgressFunc) error {
	files, err := EnumerateMediaFiles(idx.sourceDir, idx.sourceType)
	if err != nil {
		return fmt.Errorf("enumerate media files: %w", err)
	}
	if len(files) == 0 {
		return fmt.Errorf("no media files found in %s", idx.sourceDir)
	}
	// Calculate total size for progress reporting
	var totalSize int64
	for _, relPath := range files {
		fullPath := filepath.Join(idx.sourceDir, relPath)
		size, err := GetFileInfo(fullPath)
		if err != nil {
			return fmt.Errorf("get file info for %s: %w", relPath, err)
		}
		totalSize += size
	}
	// Pre-allocate hash map to reduce reallocation
	// Estimate: ~1 sync point per 2KB of data on average
	estimatedSyncPoints := int(totalSize / 2048)
	if estimatedSyncPoints < 10000 {
		estimatedSyncPoints = 10000
	}
	idx.index.HashToLocations = make(map[uint64][]Location, estimatedSyncPoints)
	// For DVDs (MPEG-PS) and Blu-rays (MPEG-TS), use ES-based indexing
	// so the matcher works with continuous ES data.
	// Raw indexing is available as fallback for DVDs.
	if idx.sourceType == TypeDVD && !idx.useRawIndexing {
		idx.index.UsesESOffsets = true
	} else if idx.sourceType == TypeBluray {
		idx.index.UsesESOffsets = true
	}
	var processedSize int64
	// Process each file
	// fileIndex tracks the next available index for source file entries.
	// Most files produce one entry, but Blu-ray ISOs produce one per M2TS region.
	// NOTE(review): fileIndex is narrowed to uint16 below; a source with more
	// than 65535 entries would wrap — presumably unreachable, but worth confirming.
	fileIndex := 0
	for _, relPath := range files {
		fullPath := filepath.Join(idx.sourceDir, relPath)
		size, err := GetFileInfo(fullPath)
		if err != nil {
			return fmt.Errorf("get file info for %s: %w", relPath, err)
		}
		var checksum uint64
		if idx.sourceType == TypeDVD && !idx.useRawIndexing {
			checksum, err = idx.indexMPEGPSFile(uint16(fileIndex), fullPath, size, func(fileProcessed int64) {
				if progress != nil {
					progress(processedSize+fileProcessed, totalSize)
				}
			})
		} else if idx.sourceType == TypeBluray && isISOFile(relPath) {
			// Blu-ray ISO: one ISO may contain multiple M2TS regions,
			// each producing a separate source file entry.
			var n int
			n, _, err = idx.indexBlurayISOFile(uint16(fileIndex), fullPath, relPath, size, func(fileProcessed int64) {
				if progress != nil {
					progress(processedSize+fileProcessed, totalSize)
				}
			})
			if err != nil {
				return fmt.Errorf("index file %s: %w", relPath, err)
			}
			// indexBlurayISOFile already added source file entries
			fileIndex += n
			processedSize += size
			continue
		} else if idx.sourceType == TypeBluray {
			checksum, err = idx.indexM2TSFile(uint16(fileIndex), fullPath, size, func(fileProcessed int64) {
				if progress != nil {
					progress(processedSize+fileProcessed, totalSize)
				}
			})
		} else {
			checksum, err = idx.indexRawFile(uint16(fileIndex), fullPath, size, func(fileProcessed int64) {
				if progress != nil {
					progress(processedSize+fileProcessed, totalSize)
				}
			})
		}
		if err != nil {
			return fmt.Errorf("index file %s: %w", relPath, err)
		}
		idx.index.Files = append(idx.index.Files, File{
			RelativePath: relPath,
			Size:         size,
			Checksum:     checksum,
		})
		fileIndex++
		processedSize += size
	}
	return nil
}
// isISOFile reports whether the path names an ISO image, matching the
// ".iso" extension case-insensitively.
func isISOFile(path string) bool {
	const isoExt = ".iso"
	lowered := strings.ToLower(path)
	return strings.HasSuffix(lowered, isoExt)
}
// checksumWithProgress computes the xxhash checksum of data, feeding it to
// the hasher in fixed-size chunks. After each chunk, progress (if non-nil)
// receives the cumulative number of bytes consumed so far.
func checksumWithProgress(data []byte, progress func(int64)) uint64 {
	const step = 16 << 20 // 16MB per chunk keeps progress callbacks frequent
	h := xxhash.New()
	for start := 0; start < len(data); start += step {
		stop := min(start+step, len(data))
		h.Write(data[start:stop])
		if progress != nil {
			progress(int64(stop))
		}
	}
	return h.Sum64()
}
// indexMPEGPSFile processes an MPEG-PS file (DVD ISO) using ES-aware indexing.
// It extracts the elementary stream data and indexes sync points within it.
//
// The work is split into three progress phases of roughly equal weight:
// parse (0-33%), checksum (33-66%), and ES indexing (66-100%). The returned
// uint64 is the xxhash checksum of the whole file.
func (idx *Indexer) indexMPEGPSFile(fileIndex uint16, path string, size int64, progress func(int64)) (uint64, error) {
	// Memory-map the file with zero-copy access
	mmapFile, err := mmap.Open(path)
	if err != nil {
		return 0, fmt.Errorf("mmap open: %w", err)
	}
	// Note: Don't close mmapFile - it's stored in MmapFiles for later use
	// Store the mmap file for cleanup
	idx.index.MmapFiles = append(idx.index.MmapFiles, mmapFile)
	// Parse MPEG-PS structure with progress reporting using zero-copy data
	parser := NewMPEGPSParser(mmapFile.Data())
	// Phase 1: Parse MPEG-PS structure (0% → 33%)
	if err := parser.ParseWithProgress(func(processed, total int64) {
		if progress != nil {
			progress(processed / 3)
		}
	}); err != nil {
		return 0, fmt.Errorf("parse MPEG-PS: %w", err)
	}
	// Store parser for later use by matcher
	idx.index.ESReaders = append(idx.index.ESReaders, parser)
	// Phase 2: Checksum (33% → 66%)
	checksum := checksumWithProgress(mmapFile.Data(), func(processed int64) {
		if progress != nil {
			progress(size/3 + processed/3)
		}
	})
	// Phase 3: Index ES data (66% → 100%)
	videoESSize := parser.TotalESSize(true)
	if videoESSize > 0 {
		indexProgress := func(fileOffset int64) {
			if progress != nil {
				progress(2*size/3 + fileOffset/3)
			}
		}
		if err := idx.indexESData(fileIndex, parser, true, videoESSize, indexProgress); err != nil {
			return 0, fmt.Errorf("index video ES: %w", err)
		}
	}
	// Index each audio sub-stream separately
	audioSubStreams := parser.AudioSubStreams()
	for _, subStreamID := range audioSubStreams {
		subStreamSize := parser.AudioSubStreamESSize(subStreamID)
		if subStreamSize > 0 {
			if parser.IsLPCMSubStream(subStreamID) {
				// LPCM has no natural sync patterns; use fixed-interval sync points.
				// The indexer forces the slow path (ReadAudioSubStreamData) for LPCM
				// so the data goes through the byte-swap transform.
				if err := idx.indexSubStream(fileIndex, parser, subStreamID, subStreamSize, FindLPCMIndexSyncPoints); err != nil {
					return 0, fmt.Errorf("index LPCM sub-stream 0x%02X: %w", subStreamID, err)
				}
			} else {
				if err := idx.indexAudioSubStream(fileIndex, parser, subStreamID, subStreamSize); err != nil {
					return 0, fmt.Errorf("index audio sub-stream 0x%02X: %w", subStreamID, err)
				}
			}
		}
	}
	// Signal completion regardless of how phase arithmetic rounded.
	if progress != nil {
		progress(size)
	}
	return checksum, nil
}
// Index returns the built index. Must call Build first; before Build the
// returned index is whatever the Indexer was constructed with.
func (idx *Indexer) Index() *Index {
	return idx.index
}
package source
import (
"fmt"
"github.com/stuckj/mkvdup/internal/mmap"
"golang.org/x/sys/unix"
)
// indexM2TSFile processes a Blu-ray M2TS file using ES-aware indexing.
// It parses the MPEG-TS structure to extract elementary stream data and
// indexes sync points within the continuous ES, matching what MKV files contain.
//
// Mirrors indexMPEGPSFile's three progress phases: parse (0-33%),
// checksum (33-66%), ES indexing (66-100%). Returns the whole-file checksum.
func (idx *Indexer) indexM2TSFile(fileIndex uint16, path string, size int64, progress func(int64)) (uint64, error) {
	mmapFile, err := mmap.Open(path)
	if err != nil {
		return 0, fmt.Errorf("mmap open: %w", err)
	}
	// Note: Don't close mmapFile - it's stored in MmapFiles for later use
	idx.index.MmapFiles = append(idx.index.MmapFiles, mmapFile)
	mmapFile.Advise(unix.MADV_SEQUENTIAL)
	// Phase 1: Parse MPEG-TS structure (0% → 33%)
	parser := NewMPEGTSParser(mmapFile.Data())
	if err := parser.ParseWithProgress(func(processed, total int64) {
		if progress != nil {
			progress(processed / 3)
		}
	}); err != nil {
		return 0, fmt.Errorf("parse MPEG-TS: %w", err)
	}
	// Store parser for later use by matcher
	idx.index.ESReaders = append(idx.index.ESReaders, parser)
	// Phase 2: Checksum (33% → 66%)
	checksum := checksumWithProgress(mmapFile.Data(), func(processed int64) {
		if progress != nil {
			progress(size/3 + processed/3)
		}
	})
	// Phase 3: Index ES data (66% → 100%)
	videoESSize := parser.TotalESSize(true)
	if videoESSize > 0 {
		indexProgress := func(fileOffset int64) {
			if progress != nil {
				progress(2*size/3 + fileOffset/3)
			}
		}
		if err := idx.indexESData(fileIndex, parser, true, videoESSize, indexProgress); err != nil {
			return 0, fmt.Errorf("index video ES: %w", err)
		}
	}
	// Index each audio sub-stream separately.
	// Subtitle sub-streams are excluded here and indexed afterwards with
	// PGS-specific sync point detection.
	subtitleIDs := parser.SubtitleSubStreams()
	subtitleSet := make(map[byte]bool, len(subtitleIDs))
	for _, id := range subtitleIDs {
		subtitleSet[id] = true
	}
	for _, subStreamID := range parser.AudioSubStreams() {
		if subtitleSet[subStreamID] {
			continue // indexed below with subtitle-specific sync points
		}
		subStreamSize := parser.AudioSubStreamESSize(subStreamID)
		if subStreamSize > 0 {
			if err := idx.indexAudioSubStream(fileIndex, parser, subStreamID, subStreamSize); err != nil {
				return 0, fmt.Errorf("index audio sub-stream %d: %w", subStreamID, err)
			}
		}
	}
	// Index subtitle sub-streams with PGS sync point detection
	for _, subStreamID := range subtitleIDs {
		subStreamSize := parser.AudioSubStreamESSize(subStreamID)
		if subStreamSize > 0 {
			if err := idx.indexSubStream(fileIndex, parser, subStreamID, subStreamSize, FindPGSSyncPoints); err != nil {
				return 0, fmt.Errorf("index subtitle sub-stream %d: %w", subStreamID, err)
			}
		}
	}
	// Signal completion regardless of how phase arithmetic rounded.
	if progress != nil {
		progress(size)
	}
	return checksum, nil
}
// indexBlurayISOFile processes a Blu-ray ISO file by finding M2TS regions
// within the ISO9660 filesystem and indexing each as a separate source file entry.
// Returns the number of source file entries created and the ISO checksum.
//
// Unlike the other index* helpers, this one appends its own File entries
// (one per successfully parsed M2TS region); the caller must NOT add another.
// Unparseable or out-of-bounds regions are skipped (logged to verboseWriter),
// but if every region is skipped an error is returned.
func (idx *Indexer) indexBlurayISOFile(startFileIndex uint16, path, relPath string, size int64, progress func(int64)) (int, uint64, error) {
	// Find M2TS file extents within the ISO
	m2tsFiles, err := findBlurayM2TSInISO(path)
	if err != nil {
		return 0, 0, fmt.Errorf("find M2TS in ISO: %w", err)
	}
	if len(m2tsFiles) == 0 {
		return 0, 0, fmt.Errorf("no M2TS files found in Blu-ray ISO")
	}
	// Memory-map the entire ISO
	mmapFile, err := mmap.Open(path)
	if err != nil {
		return 0, 0, fmt.Errorf("mmap open: %w", err)
	}
	// Don't close — stored in MmapFiles for later use
	idx.index.MmapFiles = append(idx.index.MmapFiles, mmapFile)
	mmapFile.Advise(unix.MADV_SEQUENTIAL)
	isoData := mmapFile.Data()
	// Phase 1: Parse all M2TS regions (0% → 33%)
	type parsedM2TS struct {
		adapter *isoM2TSAdapter
		extent  isoFileExtent
	}
	var parsed []parsedM2TS
	for _, m2ts := range m2tsFiles {
		var adapter *isoM2TSAdapter
		if m2ts.Extents != nil {
			// Multi-extent UDF file: create virtual contiguous view
			// over the existing mmap sub-slices (zero-copy, no heap allocation)
			mr := newMultiRegionData(m2ts.Extents, isoData)
			parser := NewMPEGTSParserMultiRegion(mr)
			if err := parser.ParseWithProgress(nil); err != nil {
				if idx.verboseWriter != nil {
					fmt.Fprintf(idx.verboseWriter, "  [indexBlurayISO] skipping %s: %v\n", m2ts.Name, err)
				}
				continue
			}
			adapter = newISOAdapterMultiExtent(parser, mr, m2ts.Extents)
		} else {
			// Contiguous file: use sub-slice of mmap'd ISO
			endOffset := m2ts.Offset + m2ts.Size
			if endOffset > int64(len(isoData)) {
				// Extent claims bytes past the end of the image; skip it
				// rather than slicing out of range.
				if idx.verboseWriter != nil {
					fmt.Fprintf(idx.verboseWriter, "  [indexBlurayISO] skipping %s: extent beyond ISO bounds (%d + %d > %d)\n",
						m2ts.Name, m2ts.Offset, m2ts.Size, len(isoData))
				}
				continue
			}
			m2tsData := isoData[m2ts.Offset:endOffset]
			parser := NewMPEGTSParser(m2tsData)
			if err := parser.ParseWithProgress(nil); err != nil {
				if idx.verboseWriter != nil {
					fmt.Fprintf(idx.verboseWriter, "  [indexBlurayISO] skipping %s: %v\n", m2ts.Name, err)
				}
				continue
			}
			adapter = newISOAdapter(parser, isoData, m2ts.Offset)
		}
		parsed = append(parsed, parsedM2TS{adapter: adapter, extent: m2ts})
	}
	if len(parsed) == 0 {
		return 0, 0, fmt.Errorf("no valid M2TS streams found in Blu-ray ISO")
	}
	if progress != nil {
		progress(size / 3)
	}
	// Phase 2: Checksum the full ISO (33% → 66%)
	checksum := checksumWithProgress(isoData, func(processed int64) {
		if progress != nil {
			progress(size/3 + processed/3)
		}
	})
	// Phase 3: Index ES data from all M2TS regions (66% → 100%)
	entriesCreated := 0
	for _, p := range parsed {
		// Each region gets its own consecutive file index.
		fileIndex := startFileIndex + uint16(entriesCreated)
		adapter := p.adapter
		// Store adapter as ESReader for this source file entry
		idx.index.ESReaders = append(idx.index.ESReaders, adapter)
		// Index video ES
		videoESSize := adapter.TotalESSize(true)
		if videoESSize > 0 {
			if err := idx.indexESData(fileIndex, adapter, true, videoESSize, nil); err != nil {
				return 0, 0, fmt.Errorf("index video ES for %s: %w", p.extent.Name, err)
			}
		}
		// Index audio sub-streams (subtitles handled separately below)
		subtitleIDs := adapter.parser.SubtitleSubStreams()
		subtitleSet := make(map[byte]bool, len(subtitleIDs))
		for _, id := range subtitleIDs {
			subtitleSet[id] = true
		}
		for _, subStreamID := range adapter.AudioSubStreams() {
			if subtitleSet[subStreamID] {
				continue
			}
			subStreamSize := adapter.AudioSubStreamESSize(subStreamID)
			if subStreamSize > 0 {
				if err := idx.indexAudioSubStream(fileIndex, adapter, subStreamID, subStreamSize); err != nil {
					return 0, 0, fmt.Errorf("index audio sub-stream %d for %s: %w", subStreamID, p.extent.Name, err)
				}
			}
		}
		// Index subtitle sub-streams with PGS sync point detection
		for _, subStreamID := range subtitleIDs {
			subStreamSize := adapter.AudioSubStreamESSize(subStreamID)
			if subStreamSize > 0 {
				if err := idx.indexSubStream(fileIndex, adapter, subStreamID, subStreamSize, FindPGSSyncPoints); err != nil {
					return 0, 0, fmt.Errorf("index subtitle sub-stream %d for %s: %w", subStreamID, p.extent.Name, err)
				}
			}
		}
		// Add source file entry — all entries share the same ISO path, size, checksum
		idx.index.Files = append(idx.index.Files, File{
			RelativePath: relPath,
			Size:         size,
			Checksum:     checksum,
		})
		entriesCreated++
	}
	if progress != nil {
		progress(size)
	}
	return entriesCreated, checksum, nil
}
package source
import (
"fmt"
"github.com/cespare/xxhash/v2"
)
// esDataProvider is the interface needed by indexESData and indexAudioSubStream.
// Both MPEGPSParser and MPEGTSParser implement this, as well as isoM2TSAdapter.
type esDataProvider interface {
	// Data returns the full backing buffer (may be nil for adapters that
	// have no single contiguous view — see isoM2TSAdapter.Data).
	Data() []byte
	// DataSlice returns size bytes starting at parser-relative offset off.
	DataSlice(off int64, size int) []byte
	// DataSize reports the total backing data size, used for bounds checks.
	DataSize() int64
	// FilteredVideoRanges returns the PES payload ranges carrying video ES data.
	FilteredVideoRanges() []PESPayloadRange
	// FilteredAudioRanges returns the PES payload ranges for one audio sub-stream.
	FilteredAudioRanges(subStreamID byte) []PESPayloadRange
	// ReadESData reads size bytes of continuous ES data starting at esOffset;
	// used when a read spans payload-range boundaries (may copy).
	ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error)
	// ReadAudioSubStreamData reads continuous ES data for one audio sub-stream.
	// For LPCM sub-streams this path applies the byte-swap transform.
	ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error)
	// IsLPCMSubStream reports whether the given sub-stream carries LPCM audio.
	IsLPCMSubStream(subStreamID byte) bool
}
// indexESData indexes the elementary stream data from an ES-aware parser.
// Uses zero-copy iteration through PES payload ranges.
//
// For each sync point found inside a payload range, a windowSize-byte window
// starting at the sync point is hashed into HashToLocations. Windows fully
// inside one range hash directly off the mmap (fast path); windows crossing a
// range boundary go through ReadESData (slow path, may copy).
//
// NOTE(review): the isVideo parameter is stored in the Location entries and
// passed to ReadESData, but ranges are always taken from FilteredVideoRanges —
// calling this with isVideo=false would still index video ranges. All visible
// call sites pass true; confirm before reusing for audio.
func (idx *Indexer) indexESData(fileIndex uint16, parser esDataProvider, isVideo bool, esSize int64, progress func(int64)) error {
	ranges := parser.FilteredVideoRanges()
	if len(ranges) == 0 {
		return nil
	}
	dataSize := parser.DataSize()
	syncPointCount := 0
	var indexFastPath, indexSlowPath, indexSkipped int
	// Iterate through each PES payload range (zero-copy when within one region)
	for rangeIdx, r := range ranges {
		endOffset := r.FileOffset + int64(r.Size)
		if endOffset > dataSize {
			// Range claims bytes past the backing data; skip defensively.
			continue
		}
		rangeData := parser.DataSlice(r.FileOffset, r.Size)
		// Find NAL unit start positions (byte after 00 00 01)
		// Hashing from NAL header enables matching both Annex B and AVCC formats
		syncPoints := FindVideoNALStarts(rangeData)
		// Add each sync point to the index
		for _, offsetInRange := range syncPoints {
			syncESOffset := r.ESOffset + int64(offsetInRange)
			// Ensure we have enough data for the window
			if syncESOffset+int64(idx.windowSize) > esSize {
				continue
			}
			// Check if window fits within this range (zero-copy fast path)
			if offsetInRange+idx.windowSize <= len(rangeData) {
				window := rangeData[offsetInRange : offsetInRange+idx.windowSize]
				hash := xxhash.Sum64(window)
				idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
					FileIndex: fileIndex,
					Offset:    syncESOffset,
					IsVideo:   isVideo,
				})
				syncPointCount++
				indexFastPath++
			} else {
				// Window spans range boundary - use ReadESData (may copy)
				window, err := parser.ReadESData(syncESOffset, idx.windowSize, isVideo)
				if err != nil || len(window) < idx.windowSize {
					// Short or failed cross-range read: drop this sync point.
					indexSkipped++
					continue
				}
				hash := xxhash.Sum64(window)
				idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
					FileIndex: fileIndex,
					Offset:    syncESOffset,
					IsVideo:   isVideo,
				})
				syncPointCount++
				indexSlowPath++
			}
		}
		// Report progress periodically (every 10000 ranges, plus range 0)
		if rangeIdx%10000 == 0 && progress != nil {
			progress(r.FileOffset)
		}
	}
	if idx.verboseWriter != nil {
		fmt.Fprintf(idx.verboseWriter, "  [indexESData] video=%v: %d NALs indexed (fast=%d, slow/cross-range=%d, skipped=%d)\n",
			isVideo, syncPointCount, indexFastPath, indexSlowPath, indexSkipped)
	}
	return nil
}
// syncPointFinder is a function that returns sync point offsets within data.
// Offsets are relative to the start of the data slice it is given.
type syncPointFinder func(data []byte) []int

// indexAudioSubStream indexes a specific audio sub-stream using the generic
// audio sync point finder. Thin wrapper over indexSubStream.
func (idx *Indexer) indexAudioSubStream(fileIndex uint16, parser esDataProvider, subStreamID byte, esSize int64) error {
	return idx.indexSubStream(fileIndex, parser, subStreamID, esSize, FindAudioSyncPoints)
}
// indexSubStream indexes a specific sub-stream using the provided sync point finder.
// Uses zero-copy iteration through PES payload ranges.
// For LPCM sub-streams, always uses the slow path (ReadAudioSubStreamData) because
// the raw data is big-endian but the read method returns byte-swapped little-endian data.
//
// Mirrors indexESData's structure: fast path hashes a window directly from the
// payload range; the slow path (cross-range windows, or any LPCM window) reads
// via ReadAudioSubStreamData. Windows that cannot be fully read are dropped.
func (idx *Indexer) indexSubStream(fileIndex uint16, parser esDataProvider, subStreamID byte, esSize int64, findSyncPoints syncPointFinder) error {
	ranges := parser.FilteredAudioRanges(subStreamID)
	if len(ranges) == 0 {
		return nil
	}
	dataSize := parser.DataSize()
	isLPCM := parser.IsLPCMSubStream(subStreamID)
	// Iterate through each PES payload range (zero-copy when within one region)
	for _, r := range ranges {
		endOffset := r.FileOffset + int64(r.Size)
		if endOffset > dataSize {
			// Range claims bytes past the backing data; skip defensively.
			continue
		}
		rangeData := parser.DataSlice(r.FileOffset, r.Size)
		// Find sync points in this range (uses raw data — LPCM sync points
		// are fixed-interval so data content doesn't matter)
		syncPoints := findSyncPoints(rangeData)
		// Add each sync point to the index
		for _, offsetInRange := range syncPoints {
			syncESOffset := r.ESOffset + int64(offsetInRange)
			// Ensure we have enough data for the window
			if syncESOffset+int64(idx.windowSize) > esSize {
				continue
			}
			// For LPCM, always use ReadAudioSubStreamData which applies the transform.
			// For non-LPCM, use the zero-copy fast path when possible.
			if !isLPCM && offsetInRange+idx.windowSize <= len(rangeData) {
				window := rangeData[offsetInRange : offsetInRange+idx.windowSize]
				hash := xxhash.Sum64(window)
				idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
					FileIndex:        fileIndex,
					Offset:           syncESOffset,
					IsVideo:          false,
					AudioSubStreamID: subStreamID,
				})
			} else {
				// Window spans range boundary or LPCM - use ReadAudioSubStreamData
				window, err := parser.ReadAudioSubStreamData(subStreamID, syncESOffset, idx.windowSize)
				if err != nil || len(window) < idx.windowSize {
					// Short or failed read: drop this sync point.
					continue
				}
				hash := xxhash.Sum64(window)
				idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
					FileIndex:        fileIndex,
					Offset:           syncESOffset,
					IsVideo:          false,
					AudioSubStreamID: subStreamID,
				})
			}
		}
	}
	return nil
}
package source
import (
	"fmt"
	"io"

	"github.com/cespare/xxhash/v2"
	"github.com/stuckj/mkvdup/internal/mmap"
	"golang.org/x/sys/unix"
)
// mmapRawReader wraps mmap.File to implement RawReader interface.
// It offers both copying access (ReadAt) and zero-copy access (Slice)
// to the memory-mapped file contents.
type mmapRawReader struct {
	mmapFile *mmap.File // underlying mapping; released via Close
}
// ReadAt copies up to len(buf) bytes from the mapped file starting at offset,
// implementing io.ReaderAt. Returns an error if the offset is out of range.
//
// Fix: the io.ReaderAt contract requires a non-nil error whenever fewer than
// len(buf) bytes are returned; the previous version returned a short count
// with a nil error when the mapping yielded a truncated slice.
func (r *mmapRawReader) ReadAt(buf []byte, offset int64) (int, error) {
	data := r.mmapFile.Slice(offset, len(buf))
	if data == nil {
		return 0, fmt.Errorf("offset out of range")
	}
	n := copy(buf, data)
	if n < len(buf) {
		// Short read at end of mapping: io.ReaderAt requires a non-nil error.
		return n, io.EOF
	}
	return n, nil
}
// Slice returns a zero-copy slice of the underlying mmap'd data.
// A nil result indicates the requested range is out of bounds (see ReadAt).
func (r *mmapRawReader) Slice(offset int64, size int) []byte {
	return r.mmapFile.Slice(offset, size)
}

// Len returns the length in bytes of the mapped data.
func (r *mmapRawReader) Len() int {
	return r.mmapFile.Len()
}

// Close releases the underlying memory mapping.
func (r *mmapRawReader) Close() error {
	return r.mmapFile.Close()
}
// indexRawFile processes a raw file (for non-DVD, non-Blu-ray formats).
// Processes the file in a single pass: computes checksum and indexes sync points
// together in chunks, releasing mmap pages as they're processed.
//
// The mapping is kept open and registered in RawReaders so the matcher can
// read from it later; it must not be closed here.
func (idx *Indexer) indexRawFile(fileIndex uint16, path string, size int64, progress func(int64)) (uint64, error) {
	mmapFile, err := mmap.Open(path)
	if err != nil {
		return 0, fmt.Errorf("mmap open: %w", err)
	}
	idx.index.RawReaders = append(idx.index.RawReaders, &mmapRawReader{mmapFile: mmapFile})
	mmapFile.Advise(unix.MADV_SEQUENTIAL)
	data := mmapFile.Data()
	return idx.indexRawFileData(fileIndex, mmapFile, data, size, progress)
}
// indexRawFileData is the core of indexRawFile operating on already-opened mmap data.
// Used as a fallback when M2TS packet structure cannot be detected.
//
// Walks the data in 64MB chunks with a 3-byte overlap so sync patterns that
// straddle a chunk boundary are still detected. checksumPos tracks how far the
// hasher has consumed so overlapped bytes are never hashed twice. Pages behind
// the current chunk are released with MADV_DONTNEED to bound resident memory.
func (idx *Indexer) indexRawFileData(fileIndex uint16, mmapFile *mmap.File, data []byte, size int64, progress func(int64)) (uint64, error) {
	hasher := xxhash.New()
	const chunkSize = 64 * 1024 * 1024
	// Overlap covers a sync pattern split across the chunk boundary.
	const overlap = 3
	pageSize := unix.Getpagesize()
	checksumPos := 0
	for chunkStart := 0; chunkStart < len(data); {
		chunkEnd := chunkStart + chunkSize
		if chunkEnd > len(data) {
			chunkEnd = len(data)
		}
		chunk := data[chunkStart:chunkEnd]
		// Hash only the bytes not already consumed (overlap region excluded).
		if chunkEnd > checksumPos {
			hasher.Write(data[checksumPos:chunkEnd])
			checksumPos = chunkEnd
		}
		// Offsets returned are absolute (chunkStart is passed as the base).
		videoOffsets := FindVideoNALStartsInRange(chunk, chunkStart)
		audioOffsets := FindAudioSyncPointsInRange(chunk, chunkStart)
		for _, offset := range videoOffsets {
			// Skip sync points whose hash window would run past EOF.
			if offset+idx.windowSize > len(data) {
				continue
			}
			window := data[offset : offset+idx.windowSize]
			hash := xxhash.Sum64(window)
			idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
				FileIndex: fileIndex,
				Offset:    int64(offset),
			})
		}
		for _, offset := range audioOffsets {
			if offset+idx.windowSize > len(data) {
				continue
			}
			window := data[offset : offset+idx.windowSize]
			hash := xxhash.Sum64(window)
			idx.index.HashToLocations[hash] = append(idx.index.HashToLocations[hash], Location{
				FileIndex: fileIndex,
				Offset:    int64(offset),
			})
		}
		if progress != nil {
			progress(int64(chunkEnd))
		}
		// Release fully-processed pages (page-aligned) to cap memory use.
		releaseUpTo := (chunkStart / pageSize) * pageSize
		if releaseUpTo > 0 {
			unix.Madvise(data[:releaseUpTo], unix.MADV_DONTNEED)
		}
		if chunkEnd >= len(data) {
			break
		}
		chunkStart = chunkEnd - overlap
	}
	checksum := hasher.Sum64()
	// Later access by the matcher is random, not sequential.
	mmapFile.Advise(unix.MADV_RANDOM)
	return checksum, nil
}
package source
import (
"errors"
"fmt"
"io"
"os"
"strings"
)
// errNotISO9660 is returned when the image lacks a valid ISO9660 PVD,
// signaling the caller to try an alternative filesystem (e.g. UDF).
var errNotISO9660 = errors.New("not an ISO9660 image")

// isoSectorSize is the ISO9660 logical sector size in bytes.
const isoSectorSize = 2048

// isoFileExtent represents a file within an ISO9660 filesystem.
type isoFileExtent struct {
	Name    string             // filename (uppercase, no version suffix)
	Offset  int64              // byte offset in ISO (first extent)
	Size    int64              // data length in bytes
	IsDir   bool               // true if this is a directory entry
	Extents []isoPhysicalRange // non-nil for multi-extent UDF files
}

// isoPhysicalRange describes one contiguous physical region within an ISO.
type isoPhysicalRange struct {
	ISOOffset int64 // byte offset in the ISO file
	Length    int64 // number of bytes
}
// findBlurayM2TSInISO finds M2TS files under BDMV/STREAM/ in a Blu-ray ISO.
// Tries UDF first (native Blu-ray filesystem), falls back to ISO9660.
// UDF is preferred because ISO9660 has a 4 GB file size limit and cannot
// properly represent large M2TS files common on Blu-ray discs.
func findBlurayM2TSInISO(isoPath string) ([]isoFileExtent, error) {
	f, err := os.Open(isoPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	// Try UDF first — Blu-ray's native filesystem, no file size limits.
	udfFiles, udfErr := findBlurayM2TSInUDF(f)
	if udfErr == nil && len(udfFiles) > 0 {
		return udfFiles, nil
	}
	// Fall back to ISO9660 (some DVD-based ISOs or hybrid discs).
	rootExtent, rootDataLen, err := readISOPVDRoot(f)
	if err != nil {
		if errors.Is(err, errNotISO9660) {
			// No ISO9660 PVD found — report both failures if UDF also failed.
			if udfErr != nil {
				return nil, fmt.Errorf("neither UDF (%v) nor ISO9660 (%w) found", udfErr, err)
			}
			return nil, fmt.Errorf("read ISO PVD: %w", err)
		}
		// ISO9660 PVD exists but had a read/parse error — surface it directly.
		if udfErr != nil {
			return nil, fmt.Errorf("read ISO PVD: %w (UDF attempt also failed: %v)", err, udfErr)
		}
		return nil, fmt.Errorf("read ISO PVD: %w", err)
	}
	// Navigate: root → BDMV → STREAM
	rootEntries, err := readISODirectory(f, rootExtent, rootDataLen)
	if err != nil {
		return nil, fmt.Errorf("read ISO root directory: %w", err)
	}
	bdmv, err := findISOEntry(rootEntries, "BDMV")
	if err != nil {
		return nil, fmt.Errorf("find BDMV directory: %w", err)
	}
	// Directory entries store byte offsets; readISODirectory wants sector LBAs.
	bdmvEntries, err := readISODirectory(f, uint32(bdmv.Offset/isoSectorSize), uint32(bdmv.Size))
	if err != nil {
		return nil, fmt.Errorf("read BDMV directory: %w", err)
	}
	stream, err := findISOEntry(bdmvEntries, "STREAM")
	if err != nil {
		return nil, fmt.Errorf("find STREAM directory: %w", err)
	}
	streamEntries, err := readISODirectory(f, uint32(stream.Offset/isoSectorSize), uint32(stream.Size))
	if err != nil {
		return nil, fmt.Errorf("read STREAM directory: %w", err)
	}
	// Collect M2TS files (names are already uppercased by readISODirectory)
	var m2tsFiles []isoFileExtent
	for _, e := range streamEntries {
		if !e.IsDir && strings.HasSuffix(e.Name, ".M2TS") {
			m2tsFiles = append(m2tsFiles, e)
		}
	}
	return m2tsFiles, nil
}
// readISOPVDRoot reads the Primary Volume Descriptor and returns the root
// directory extent LBA and data length. The PVD lives at sector 16 and must
// carry descriptor type 1 with the "CD001" standard identifier; anything else
// yields errNotISO9660 so the caller can try another filesystem.
func readISOPVDRoot(f *os.File) (extentLBA uint32, dataLen uint32, err error) {
	buf := make([]byte, isoSectorSize)
	if _, err := f.ReadAt(buf, 16*isoSectorSize); err != nil {
		return 0, 0, err
	}
	if buf[0] != 1 || string(buf[1:6]) != "CD001" {
		return 0, 0, fmt.Errorf("%w: invalid primary volume descriptor", errNotISO9660)
	}
	// The root directory record starts at byte 156 of the PVD and is at
	// least 34 bytes long.
	rec := buf[156:]
	if len(rec) < 34 {
		return 0, 0, fmt.Errorf("%w: root directory record too short", errNotISO9660)
	}
	// Little-endian halves of the both-endian fields: extent LBA at
	// record bytes 2-5, data length at bytes 10-13.
	le32 := func(b []byte) uint32 {
		return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
	}
	return le32(rec[2:]), le32(rec[10:]), nil
}
// readISODirectory reads and parses an ISO9660 directory at the given extent.
//
// Each directory record starts with its own length byte; a zero length means
// padding to the next sector boundary. Names are normalized to uppercase with
// the ";1" version suffix and any trailing dot stripped. Malformed records are
// skipped rather than aborting the whole directory.
func readISODirectory(f *os.File, extentLBA, dataLen uint32) ([]isoFileExtent, error) {
	// Cap directory read to 256KB to avoid huge allocations
	if dataLen > 256*1024 {
		dataLen = 256 * 1024
	}
	dirData := make([]byte, dataLen)
	if _, err := f.ReadAt(dirData, int64(extentLBA)*isoSectorSize); err != nil {
		return nil, err
	}
	var entries []isoFileExtent
	offset := 0
	for offset < len(dirData) {
		recLen := int(dirData[offset])
		if recLen == 0 {
			// Padding at end of sector — skip to next sector boundary
			nextSector := ((offset / isoSectorSize) + 1) * isoSectorSize
			if nextSector >= len(dirData) {
				break
			}
			offset = nextSector
			continue
		}
		// Truncated record at the end of the buffer: stop parsing.
		if offset+recLen > len(dirData) {
			break
		}
		// Need at least up to the name-length byte (offset+32).
		if offset+33 > len(dirData) {
			break
		}
		nameLen := int(dirData[offset+32])
		if nameLen == 0 || offset+33+nameLen > len(dirData) {
			offset += recLen
			continue
		}
		name := string(dirData[offset+33 : offset+33+nameLen])
		// Skip "." and ".." entries (single byte 0x00 or 0x01)
		if nameLen == 1 && (name[0] == 0x00 || name[0] == 0x01) {
			offset += recLen
			continue
		}
		// Normalize: uppercase, strip version (";1") and trailing dot
		name = strings.ToUpper(name)
		if idx := strings.Index(name, ";"); idx >= 0 {
			name = name[:idx]
		}
		name = strings.TrimSuffix(name, ".")
		// Extract extent LBA (bytes 2-5, little-endian)
		eLBA := uint32(dirData[offset+2]) | uint32(dirData[offset+3])<<8 |
			uint32(dirData[offset+4])<<16 | uint32(dirData[offset+5])<<24
		// Extract data length (bytes 10-13, little-endian)
		eLen := uint32(dirData[offset+10]) | uint32(dirData[offset+11])<<8 |
			uint32(dirData[offset+12])<<16 | uint32(dirData[offset+13])<<24
		// File flags byte 25: bit 1 = directory
		isDir := dirData[offset+25]&0x02 != 0
		entries = append(entries, isoFileExtent{
			Name:   name,
			Offset: int64(eLBA) * isoSectorSize, // convert LBA to byte offset
			Size:   int64(eLen),
			IsDir:  isDir,
		})
		offset += recLen
	}
	return entries, nil
}
// findISOEntry finds a named directory entry (case-insensitive).
// Entry names are already uppercase-normalized by readISODirectory, so the
// lookup uppercases the query and compares exactly. Returns a pointer into
// the entries slice, or an error if no entry matches.
func findISOEntry(entries []isoFileExtent, name string) (*isoFileExtent, error) {
	want := strings.ToUpper(name)
	for i, e := range entries {
		if e.Name == want {
			return &entries[i], nil
		}
	}
	return nil, fmt.Errorf("%q not found", name)
}
// readISOFileExtent reads up to maxBytes from an isoFileExtent, handling both
// contiguous files (single ReadAt from Offset) and non-contiguous UDF files
// (stitching reads across Extents). Returns the data read and any error.
// io.EOF and io.ErrUnexpectedEOF are treated as non-fatal (partial read OK),
// but reading zero bytes is always an error.
//
// Fix: the contiguous path previously returned an empty slice with a nil
// error when ReadAt yielded zero bytes (e.g. offset past EOF), while the
// multi-extent path treated zero bytes as an error. Both paths now agree.
func readISOFileExtent(f *os.File, ext isoFileExtent, maxBytes int64) ([]byte, error) {
	readSize := min(ext.Size, maxBytes)
	if readSize <= 0 {
		return nil, fmt.Errorf("file %s has non-positive size %d", ext.Name, ext.Size)
	}
	data := make([]byte, readSize)
	if len(ext.Extents) == 0 {
		// Contiguous: single read from Offset.
		n, err := f.ReadAt(data, ext.Offset)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return nil, err
		}
		if n == 0 {
			// Consistent with the multi-extent path: no data is an error.
			return nil, fmt.Errorf("no data read from %s", ext.Name)
		}
		return data[:n], nil
	}
	// Non-contiguous: stitch reads across physical extents.
	var totalRead int
	for _, pe := range ext.Extents {
		if int64(totalRead) >= readSize {
			break
		}
		remaining := int(readSize) - totalRead
		chunkSize := min(int(pe.Length), remaining)
		if chunkSize <= 0 {
			continue
		}
		n, err := f.ReadAt(data[totalRead:totalRead+chunkSize], pe.ISOOffset)
		totalRead += n
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			if totalRead == 0 {
				return nil, err
			}
			break // keep what we have
		}
		if n < chunkSize {
			break // short read
		}
	}
	if totalRead == 0 {
		return nil, fmt.Errorf("no data read from %s extents", ext.Name)
	}
	return data[:totalRead], nil
}
package source
import "sort"
// isoM2TSAdapter wraps an MPEGTSParser to provide ISO-level integration
// for an M2TS region embedded within a Blu-ray ISO file. The parser operates
// on a sub-slice (contiguous) or virtual contiguous view (multi-extent) of
// the ISO data, producing FileOffset values relative to that view.
//
// The adapter handles two offset domains:
//   - Parser-relative: used by FilteredVideoRanges (zero-copy from parser),
//     DataSlice (adds baseOffset / resolves via multiRegionData internally),
//     and all ES-offset-based reads.
//   - ISO-relative: used by range maps stored in the dedup file. The
//     FileOffsetConverter method provides the conversion function, applied
//     lazily during range map encoding to avoid copying range arrays.
//
// Exactly one of the two layouts is active: mr == nil means contiguous
// (isoData + baseOffset are used); mr != nil means multi-extent
// (mr + extentMap are used).
type isoM2TSAdapter struct {
	parser     *MPEGTSParser
	isoData    []byte // full ISO mmap data (contiguous case: used by Data/DataSlice)
	baseOffset int64  // M2TS region start offset within the ISO
	// For non-contiguous multi-extent files:
	mr        *multiRegionData // virtual contiguous view over mmap sub-slices
	extentMap []extentMapEntry // maps logical offset → ISO offset
}

// extentMapEntry maps a range of logical (assembled) offsets to physical ISO offsets.
type extentMapEntry struct {
	LogicalStart int64 // start offset in assembled data
	ISOOffset    int64 // corresponding offset in the ISO file
	Length       int64 // length of this extent
}
// newISOAdapter creates an adapter for a contiguous M2TS region within an ISO.
// isoData is the full mmap'd ISO; baseOffset is where the region starts in it.
func newISOAdapter(parser *MPEGTSParser, isoData []byte, baseOffset int64) *isoM2TSAdapter {
	return &isoM2TSAdapter{
		parser:     parser,
		isoData:    isoData,
		baseOffset: baseOffset,
	}
}
// newISOAdapterMultiExtent creates an adapter for a non-contiguous M2TS region.
// mr provides a virtual contiguous view over the mmap sub-slices.
// extents describes the physical layout in the ISO; the extent map built here
// records, for each physical extent, where it begins in the assembled
// (logical) byte stream so logical offsets can later be mapped back to ISO offsets.
func newISOAdapterMultiExtent(parser *MPEGTSParser, mr *multiRegionData, extents []isoPhysicalRange) *isoM2TSAdapter {
	em := make([]extentMapEntry, 0, len(extents))
	var logical int64
	for _, ext := range extents {
		em = append(em, extentMapEntry{
			LogicalStart: logical,
			ISOOffset:    ext.ISOOffset,
			Length:       ext.Length,
		})
		logical += ext.Length
	}
	return &isoM2TSAdapter{
		parser:    parser,
		mr:        mr,
		extentMap: em,
	}
}
// --- esDataProvider interface (used by indexer) ---

// Data returns the backing data buffer. Contiguous regions expose the full
// ISO mmap; multi-extent regions have no single buffer, so callers get nil
// and must go through DataSlice instead.
func (a *isoM2TSAdapter) Data() []byte {
	if a.mr == nil {
		return a.isoData
	}
	return nil
}
// DataSlice returns a sub-slice of the backing data at the given offset and size.
// Offsets are parser-relative (from FilteredVideoRanges); the adapter performs
// the mapping to the underlying storage itself.
func (a *isoM2TSAdapter) DataSlice(off int64, size int) []byte {
	if a.mr != nil {
		// Multi-extent: parser-relative equals assembled-relative; the
		// multiRegionData view resolves it to the right mmap sub-slice.
		return a.mr.Slice(off, off+int64(size))
	}
	// Contiguous: shift into the ISO by the region's base offset.
	start := off + a.baseOffset
	return a.isoData[start : start+int64(size)]
}
// DataSize returns the parser's data size (for bounds checking parser-relative offsets).
func (a *isoM2TSAdapter) DataSize() int64 {
	return a.parser.DataSize()
}

// FilteredVideoRanges returns the parser's filtered video ranges (zero-copy).
// FileOffset values are parser-relative. Use FileOffsetConverter to get
// ISO-relative offsets for range map encoding.
func (a *isoM2TSAdapter) FilteredVideoRanges() []PESPayloadRange {
	return a.parser.FilteredVideoRanges()
}

// FilteredAudioRanges returns the parser's filtered audio ranges (zero-copy).
// FileOffset values are parser-relative, like FilteredVideoRanges.
func (a *isoM2TSAdapter) FilteredAudioRanges(subStreamID byte) []PESPayloadRange {
	return a.parser.FilteredAudioRanges(subStreamID)
}

// ReadESData delegates to the parser; esOffset is an ES-domain offset.
func (a *isoM2TSAdapter) ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error) {
	return a.parser.ReadESData(esOffset, size, isVideo)
}

// ReadAudioSubStreamData delegates to the parser for one audio sub-stream.
func (a *isoM2TSAdapter) ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error) {
	return a.parser.ReadAudioSubStreamData(subStreamID, esOffset, size)
}
// --- ESReader interface (used by matcher/reconstruction) ---
// ESOffsetToFileOffset maps an ES offset to an ISO-relative file offset plus
// the number of payload bytes remaining in that range.
func (a *isoM2TSAdapter) ESOffsetToFileOffset(esOffset int64, isVideo bool) (fileOffset int64, remaining int) {
	off, rem := a.parser.ESOffsetToFileOffset(esOffset, isVideo)
	if a.mr == nil {
		// Contiguous: parser-relative + baseOffset = ISO-relative.
		return off + a.baseOffset, rem
	}
	// Multi-extent: the parser offset is assembled-relative; translate it
	// through the extent map for range maps / reconstruction.
	return a.logicalToISO(off), rem
}
// TotalESSize delegates to the wrapped parser's total ES size for the
// requested stream kind.
func (a *isoM2TSAdapter) TotalESSize(isVideo bool) int64 {
	return a.parser.TotalESSize(isVideo)
}
// AudioSubStreams delegates to the wrapped parser's sub-stream ID list.
func (a *isoM2TSAdapter) AudioSubStreams() []byte {
	return a.parser.AudioSubStreams()
}
// AudioSubStreamESSize delegates to the wrapped parser's per-sub-stream
// ES size.
func (a *isoM2TSAdapter) AudioSubStreamESSize(subStreamID byte) int64 {
	return a.parser.AudioSubStreamESSize(subStreamID)
}
// --- PESRangeProvider interface (used for range map creation) ---
// FilteredVideoRanges and FilteredAudioRanges already defined above.
// AudioSubStreams already defined above.
// --- FileOffsetAdjuster interface ---
// FileOffsetConverter returns a function that converts parser-relative
// FileOffset values to ISO-relative offsets for range map storage.
// Multi-extent files go through the extent map; contiguous files just add
// the clip's base offset.
func (a *isoM2TSAdapter) FileOffsetConverter() func(int64) int64 {
	if a.mr != nil {
		return a.logicalToISO
	}
	// Capture the base offset by value so the closure does not retain the
	// whole adapter.
	base := a.baseOffset
	return func(off int64) int64 { return base + off }
}
// --- hintedESReader interface (used by matcher expand) ---
// ReadESByteWithHint delegates single-byte ES reads (with a range hint for
// sequential access) to the wrapped parser.
func (a *isoM2TSAdapter) ReadESByteWithHint(esOffset int64, isVideo bool, rangeHint int) (byte, int, bool) {
	return a.parser.ReadESByteWithHint(esOffset, isVideo, rangeHint)
}
// ReadAudioByteWithHint delegates single-byte audio sub-stream reads (with a
// range hint) to the wrapped parser.
func (a *isoM2TSAdapter) ReadAudioByteWithHint(subStreamID byte, esOffset int64, rangeHint int) (byte, int, bool) {
	return a.parser.ReadAudioByteWithHint(subStreamID, esOffset, rangeHint)
}
// IsLPCMSubStream always returns false for Blu-ray M2TS (LPCM handling in
// this codebase is DVD-only), regardless of the sub-stream ID.
func (a *isoM2TSAdapter) IsLPCMSubStream(_ byte) bool {
	return false
}
// --- ESRangeConverter interface (for V3 format — adds baseOffset to raw ranges) ---
// RawRangesForESRegion fetches the parser's raw ranges for an ES region and
// rewrites their offsets to be ISO-relative (adding baseOffset, or mapping
// through the extent map for multi-extent files).
func (a *isoM2TSAdapter) RawRangesForESRegion(esOffset int64, size int, isVideo bool) ([]RawRange, error) {
	raw, err := a.parser.RawRangesForESRegion(esOffset, size, isVideo)
	if err != nil {
		return nil, err
	}
	return a.adjustRawRanges(raw), nil
}
// RawRangesForAudioSubStream fetches the parser's raw ranges for an audio
// sub-stream region and rewrites their offsets to be ISO-relative.
func (a *isoM2TSAdapter) RawRangesForAudioSubStream(subStreamID byte, esOffset int64, size int) ([]RawRange, error) {
	raw, err := a.parser.RawRangesForAudioSubStream(subStreamID, esOffset, size)
	if err != nil {
		return nil, err
	}
	return a.adjustRawRanges(raw), nil
}
// --- Internal helpers ---
// adjustRawRanges returns a copy of the raw ranges with ISO-relative
// offsets. Raw ranges are small (per-match, not per-packet), so copying
// is cheap.
func (a *isoM2TSAdapter) adjustRawRanges(ranges []RawRange) []RawRange {
	if a.mr != nil {
		// Multi-extent: assembled-relative offsets may straddle extent
		// boundaries, so map (and possibly split) them to ISO offsets.
		return a.mapRawRangesToISO(ranges)
	}
	// Contiguous: a constant shift by baseOffset is sufficient.
	out := make([]RawRange, 0, len(ranges))
	for _, r := range ranges {
		out = append(out, RawRange{
			FileOffset: a.baseOffset + r.FileOffset,
			Size:       r.Size,
		})
	}
	return out
}
// logicalToISO converts a logical offset within the assembled data to the
// corresponding physical ISO offset via the extent map. With an empty map
// the offset is returned unchanged.
func (a *isoM2TSAdapter) logicalToISO(logicalOff int64) int64 {
	if len(a.extentMap) == 0 {
		return logicalOff
	}
	// Binary search: first extent whose end lies beyond the offset.
	i := sort.Search(len(a.extentMap), func(k int) bool {
		e := a.extentMap[k]
		return e.LogicalStart+e.Length > logicalOff
	})
	if i == len(a.extentMap) {
		// Offset past the final extent — shouldn't happen; clamp to the
		// last extent as a fallback.
		i--
	}
	ext := a.extentMap[i]
	return ext.ISOOffset + (logicalOff - ext.LogicalStart)
}
// mapRawRangesToISO converts assembled-relative raw ranges to ISO-relative
// ranges. Because a single assembled range can cross an extent boundary,
// one input range may produce several output ranges.
func (a *isoM2TSAdapter) mapRawRangesToISO(ranges []RawRange) []RawRange {
	var out []RawRange
	for _, r := range ranges {
		logOff := r.FileOffset
		left := int64(r.Size)
		for left > 0 {
			// First extent whose end lies beyond the current offset.
			i := sort.Search(len(a.extentMap), func(k int) bool {
				e := a.extentMap[k]
				return e.LogicalStart+e.Length > logOff
			})
			if i == len(a.extentMap) {
				// Offset beyond the mapped extents — drop the remainder.
				break
			}
			ext := a.extentMap[i]
			inExtent := logOff - ext.LogicalStart
			// Take as much of this extent as the range still needs.
			chunk := ext.Length - inExtent
			if chunk > left {
				chunk = left
			}
			out = append(out, RawRange{
				FileOffset: ext.ISOOffset + inExtent,
				Size:       int(chunk),
			})
			logOff += chunk
			left -= chunk
		}
	}
	return out
}
package source
// DVD LPCM audio frame format (after 4-byte PS private stream header):
//
// Byte 0: emphasis(1) | mute(1) | reserved(1) | frame_number(5)
// Byte 1: quant_word_length(2) | sampling_freq(2) | reserved(1) | num_channels(3)
// Byte 2: dynamic_range_control
// Bytes 3+: PCM sample data (big-endian, grouped by bit depth)
//
// DVD stores big-endian samples with per-frame headers, while MKV stores
// A_PCM/INT/LIT (raw little-endian PCM, no framing). The transforms in this
// file convert between these two representations.
// LPCMHeaderSize is the size of the LPCM frame header after the 4-byte PS header.
const LPCMHeaderSize = 3

// LPCMTotalHeaderSize is the total header size to strip per PES payload
// (4-byte PS private-stream header + 3-byte LPCM frame header).
const LPCMTotalHeaderSize = 7

// lpcmIndexSyncInterval is the interval for source-side LPCM sync points.
// One sync point per PES payload range is sufficient when the MKV side uses
// a dense interval. Keeping the source interval large minimizes hash map memory.
const lpcmIndexSyncInterval = 2048

// lpcmMatchSyncInterval is the interval for MKV-side LPCM sync points.
// DVD LPCM PES payloads are typically ~2008 bytes while MKV packets are typically
// ~6400 bytes. Since gcd(2008, 6400) = 8, using an 8-byte interval guarantees
// at least one MKV sync point aligns with each source sync point. This is
// denser than other audio codecs but adds no memory (MKV sync points are lookups,
// not stored in the hash map), and LPCM is rare.
const lpcmMatchSyncInterval = 8
// LPCMFrameHeader represents a parsed DVD LPCM frame header (the 3 bytes
// that follow the 4-byte PS private-stream header).
type LPCMFrameHeader struct {
	Emphasis    bool // byte 0 bit 7
	Mute        bool // byte 0 bit 6
	FrameNumber byte // 5 bits (byte 0 low bits)
	Quantization byte // 2 bits: 0=16-bit, 1=20-bit, 2=24-bit
	SampleRate  byte // 2 bits: 0=48kHz, 1=96kHz
	Channels    byte // 3 bits: number of channels minus 1
}
// ParseLPCMFrameHeader parses a 3-byte DVD LPCM frame header. If fewer than
// LPCMHeaderSize bytes are supplied, the zero value is returned.
func ParseLPCMFrameHeader(data []byte) LPCMFrameHeader {
	var hdr LPCMFrameHeader
	if len(data) < LPCMHeaderSize {
		return hdr
	}
	b0, b1 := data[0], data[1]
	// Byte 0: emphasis(1) | mute(1) | reserved(1) | frame_number(5)
	hdr.Emphasis = b0&0x80 != 0
	hdr.Mute = b0&0x40 != 0
	hdr.FrameNumber = b0 & 0x1F
	// Byte 1: quant(2) | sample_rate(2) | reserved(1) | channels(3)
	hdr.Quantization = (b1 >> 6) & 0x03
	hdr.SampleRate = (b1 >> 4) & 0x03
	hdr.Channels = b1 & 0x07
	return hdr
}
// IsLPCM16Bit reports whether the quantization code denotes 16-bit LPCM
// (code 0). Only 16-bit LPCM is supported for matching and FUSE
// reconstruction: 20-bit (code 1) and 24-bit (code 2) use grouped big-endian
// packing that changes data size during transform, making in-place FUSE
// reconstruction infeasible without significant complexity.
func IsLPCM16Bit(quantization byte) bool {
	switch quantization {
	case 0:
		return true
	default:
		return false
	}
}
// TransformLPCM16BE byte-swaps 16-bit big-endian PCM samples in place,
// producing little-endian: each pair [HI][LO] becomes [LO][HI]. A trailing
// odd byte, if any, is left untouched.
func TransformLPCM16BE(data []byte) {
	for i := 0; i+1 < len(data); i += 2 {
		data[i], data[i+1] = data[i+1], data[i]
	}
}
// InverseTransformLPCM16 converts little-endian 16-bit PCM back to
// big-endian. A pairwise byte swap is its own inverse, so this performs the
// same swap as TransformLPCM16BE: each [LO][HI] pair becomes [HI][LO], and a
// trailing odd byte is left untouched.
func InverseTransformLPCM16(data []byte) {
	for i := 0; i+1 < len(data); i += 2 {
		data[i], data[i+1] = data[i+1], data[i]
	}
}
// FindLPCMIndexSyncPoints returns sync points for source-side LPCM indexing:
// offsets 0, lpcmIndexSyncInterval, 2*lpcmIndexSyncInterval, ... up to
// len(data). The large interval keeps the source hash map small. Returns nil
// for empty input.
func FindLPCMIndexSyncPoints(data []byte) []int {
	n := len(data)
	if n == 0 {
		return nil
	}
	// Pre-size: ceil(n / interval) offsets will be produced.
	offsets := make([]int, 0, (n+lpcmIndexSyncInterval-1)/lpcmIndexSyncInterval)
	for off := 0; off < n; off += lpcmIndexSyncInterval {
		offsets = append(offsets, off)
	}
	return offsets
}
// FindLPCMMatchSyncPoints returns sync points for MKV-side LPCM matching:
// offsets 0, lpcmMatchSyncInterval, 2*lpcmMatchSyncInterval, ... up to
// len(data). The dense 8-byte interval guarantees alignment with source sync
// points and adds no memory overhead (MKV sync points are used for hash
// lookups, not stored in the index). Returns nil for empty input.
func FindLPCMMatchSyncPoints(data []byte) []int {
	n := len(data)
	if n == 0 {
		return nil
	}
	// Pre-size: ceil(n / interval) offsets will be produced.
	offsets := make([]int, 0, (n+lpcmMatchSyncInterval-1)/lpcmMatchSyncInterval)
	for off := 0; off < n; off += lpcmMatchSyncInterval {
		offsets = append(offsets, off)
	}
	return offsets
}
// IsLPCMSubStreamID reports whether the sub-stream ID lies in the DVD LPCM
// range 0xA0-0xA7 (i.e. the top five bits are 10100).
func IsLPCMSubStreamID(subStreamID byte) bool {
	return subStreamID&0xF8 == 0xA0
}
package source
import (
"bytes"
"encoding/binary"
"fmt"
)
// MPEG-PS start codes. Each value is the full 32-bit pattern 00 00 01 xx;
// the scanner matches the 3-byte 00 00 01 prefix and ORs in the fourth byte.
const (
	PackStartCode      = 0x000001BA
	SystemHeaderCode   = 0x000001BB
	ProgramEndCode     = 0x000001B9
	PrivateStream1Code = 0x000001BD
	PrivateStream2Code = 0x000001BF
	PaddingStreamCode  = 0x000001BE
	VideoStreamMinCode = 0x000001E0 // video stream IDs span E0-EF
	VideoStreamMaxCode = 0x000001EF
	AudioStreamMinCode = 0x000001C0 // MPEG audio stream IDs span C0-DF
	AudioStreamMaxCode = 0x000001DF
)
// PESPacket represents a parsed PES packet from an MPEG-PS stream.
// Offsets are relative to the start of the parsed file data.
type PESPacket struct {
	StreamID      byte  // Stream identifier (E0-EF = video, C0-DF = audio, BD = private)
	SubStreamID   byte  // Sub-stream ID for Private Stream 1 (0x80-0x87 = AC3, 0x88-0x8F = DTS)
	Offset        int64 // Offset of the PES packet start in the file
	HeaderSize    int   // Total header size (start code + length + PES header + private header)
	PayloadOffset int64 // Offset of the actual audio/video payload
	PayloadSize   int   // Size of the payload
	IsVideo       bool  // True if this is a video stream
	IsAudio       bool  // True if this is an audio stream
}
// PESPayloadRange represents a contiguous range of elementary stream payload
// data: where the bytes live in the file and where they land in the logical
// (concatenated) elementary stream.
type PESPayloadRange struct {
	FileOffset int64 // Offset in the MPEG-PS file
	Size       int   // Size of this payload chunk
	ESOffset   int64 // Logical offset in the elementary stream
}
// MPEGPSParser parses MPEG Program Stream files to extract PES packet
// information. It never copies the input: all reads index directly into the
// mmap'd data slice.
type MPEGPSParser struct {
	data                []byte // Direct mmap'd data - zero-copy access
	size                int64  // len(data), cached as int64 for offset math
	packets             []PESPacket
	videoRanges         []PESPayloadRange
	audioRanges         []PESPayloadRange
	audioRangeStreamIDs []byte // PES stream ID for each audioRange (0xBD or 0xC0-0xDF)
	// Filtered ranges exclude user_data sections for MKV-compatible matching
	filteredVideoRanges []PESPayloadRange
	// Filtered audio ranges per sub-stream ID - separates interleaved audio tracks
	// Each sub-stream (0x80, 0x81, etc.) gets its own filtered range set
	filteredAudioBySubStream map[byte][]PESPayloadRange
	// audioSubStreams lists the sub-stream IDs in order of appearance
	audioSubStreams []byte
	// filterUserData is set to true once Parse has built the filtered ranges
	filterUserData bool
	// LPCM sub-stream tracking
	lpcmSubStreams map[byte]bool            // which sub-streams are 16-bit LPCM (byte-swap capable)
	lpcmInfo       map[byte]LPCMFrameHeader // parsed header per LPCM sub-stream
}
// NewMPEGPSParser creates a parser over memory-mapped file contents. The
// data slice should come from a zero-copy mmap (unix.Mmap); the parser never
// copies it. Call Parse or ParseWithProgress before using any accessors.
func NewMPEGPSParser(data []byte) *MPEGPSParser {
	p := &MPEGPSParser{data: data}
	p.size = int64(len(data))
	return p
}
// MPEGPSProgressFunc is called to report MPEG-PS parsing progress with the
// number of bytes processed so far and the total byte count.
type MPEGPSProgressFunc func(processed, total int64)
// Parse scans the file and extracts all PES packet information.
// Equivalent to ParseWithProgress(nil): no progress callbacks are made.
func (p *MPEGPSParser) Parse() error {
	return p.ParseWithProgress(nil)
}
// ParseWithProgress scans the file with progress reporting.
//
// The scan walks the data in ~4MB chunks looking for MPEG start codes
// (00 00 01 xx) and dispatches on the fourth byte: pack/system headers and
// padding are skipped by their declared length, video (E0-EF), MPEG audio
// (C0-DF) and Private Stream 1 (BD) PES packets are parsed and their payload
// ranges recorded, and any other pattern advances one byte. After the raw
// scan, filtered video/audio ranges are built for MKV-compatible matching.
//
// progress may be nil; when non-nil it is invoked roughly every 100MB and
// once more at completion with (processed, total) byte counts.
//
// Fix: the inner scan loop previously used `i < len(chunkData)-4`, which
// skipped the last valid scan position (len-4). Because the inter-chunk
// back-up below is only 3 bytes, a start code beginning exactly at chunk
// offset len-4 was scanned neither in this chunk nor in the next, silently
// dropping that packet. The loop condition is now inclusive.
func (p *MPEGPSParser) ParseWithProgress(progress MPEGPSProgressFunc) error {
	pos := int64(0)
	var videoESOffset, audioESOffset int64
	lastProgress := int64(0)

	// Pre-allocate slices to reduce reallocation churn.
	// Estimate: average PES packet ~2KB, so ~size/2048 packets,
	// split roughly 60% video, 40% audio.
	estimatedPackets := int(p.size / 2048)
	if estimatedPackets < 1000 {
		estimatedPackets = 1000
	}
	p.packets = make([]PESPacket, 0, estimatedPackets)
	p.videoRanges = make([]PESPayloadRange, 0, estimatedPackets*6/10)
	p.audioRanges = make([]PESPayloadRange, 0, estimatedPackets*4/10)
	p.audioRangeStreamIDs = make([]byte, 0, estimatedPackets*4/10)

	for pos < p.size-4 {
		// Direct slice access - zero copy
		end := pos + 4*1024*1024 // Process in ~4MB logical chunks for progress
		if end > p.size {
			end = p.size
		}
		chunkData := p.data[pos:end]
		if len(chunkData) < 4 {
			break
		}

		// Scan for start codes within this chunk. A start code occupies
		// 4 bytes, so the last scannable position is len-4 (inclusive —
		// see the fix note in the doc comment above).
		i := 0
		for i <= len(chunkData)-4 {
			// Fast scan for 00 00 01 prefix
			if chunkData[i] != 0 {
				i++
				continue
			}
			if chunkData[i+1] != 0 {
				i += 2
				continue
			}
			if chunkData[i+2] != 1 {
				i++
				continue
			}

			// Found potential start code at pos + i
			startCodePos := pos + int64(i)
			startCode := uint32(0x00000100) | uint32(chunkData[i+3])
			advance := int64(1) // default: resync one byte forward on unknown codes

			switch {
			case startCode == PackStartCode:
				packSize, err := p.parsePackHeader(startCodePos)
				if err == nil {
					advance = int64(packSize)
				}
			case startCode == SystemHeaderCode:
				headerLen, err := p.parseSystemHeader(startCodePos)
				if err == nil {
					advance = int64(headerLen)
				}
			case startCode == ProgramEndCode:
				// End of program stream - but DVDs can have multiple programs
				// (menu, main feature, extras, etc.), so continue parsing
				advance = 4
			case startCode == PaddingStreamCode:
				length, err := p.readPESLength(startCodePos + 4)
				if err == nil {
					advance = 6 + int64(length)
				}
			case startCode == PrivateStream1Code:
				pkt, err := p.parsePESPacket(startCodePos, byte(startCode&0xFF))
				if err == nil {
					pkt.IsAudio = true
					p.packets = append(p.packets, pkt)
					p.audioRanges = append(p.audioRanges, PESPayloadRange{
						FileOffset: pkt.PayloadOffset,
						Size:       pkt.PayloadSize,
						ESOffset:   audioESOffset,
					})
					p.audioRangeStreamIDs = append(p.audioRangeStreamIDs, 0xBD)
					audioESOffset += int64(pkt.PayloadSize)
					advance = int64(pkt.HeaderSize + pkt.PayloadSize)
				}
			case startCode >= VideoStreamMinCode && startCode <= VideoStreamMaxCode:
				pkt, err := p.parsePESPacket(startCodePos, byte(startCode&0xFF))
				if err == nil {
					pkt.IsVideo = true
					p.packets = append(p.packets, pkt)
					p.videoRanges = append(p.videoRanges, PESPayloadRange{
						FileOffset: pkt.PayloadOffset,
						Size:       pkt.PayloadSize,
						ESOffset:   videoESOffset,
					})
					videoESOffset += int64(pkt.PayloadSize)
					advance = int64(pkt.HeaderSize + pkt.PayloadSize)
				}
			case startCode >= AudioStreamMinCode && startCode <= AudioStreamMaxCode:
				pkt, err := p.parsePESPacket(startCodePos, byte(startCode&0xFF))
				if err == nil {
					pkt.IsAudio = true
					p.packets = append(p.packets, pkt)
					p.audioRanges = append(p.audioRanges, PESPayloadRange{
						FileOffset: pkt.PayloadOffset,
						Size:       pkt.PayloadSize,
						ESOffset:   audioESOffset,
					})
					p.audioRangeStreamIDs = append(p.audioRangeStreamIDs, pkt.StreamID)
					audioESOffset += int64(pkt.PayloadSize)
					advance = int64(pkt.HeaderSize + pkt.PayloadSize)
				}
			}

			// Move forward by the packet size (or 1 if unknown)
			newPos := startCodePos + advance
			i = int(newPos - pos)
		}

		// Move to the next chunk, backing up 3 bytes so a start code whose
		// first byte falls in the last 3 positions is caught next chunk.
		pos += int64(len(chunkData)) - 3
		if pos < 0 {
			pos = 0
		}

		// Report progress
		if progress != nil && pos-lastProgress > 100*1024*1024 { // Every 100MB
			progress(pos, p.size)
			lastProgress = pos
		}
	}
	if progress != nil {
		progress(p.size, p.size)
	}

	// Build filtered video ranges that exclude user_data (B2) sections.
	// This makes the ES compatible with what MKV tools produce.
	if err := p.buildFilteredVideoRanges(); err != nil {
		return fmt.Errorf("build filtered video ranges: %w", err)
	}
	// Build filtered audio ranges that strip Private Stream 1 headers
	// (sub-stream ID and 2-byte pointer, keeping frame count byte).
	if err := p.buildFilteredAudioRanges(); err != nil {
		return fmt.Errorf("build filtered audio ranges: %w", err)
	}
	p.filterUserData = true
	return nil
}
// buildFilteredVideoRanges scans the video ES and creates ranges that exclude
// user_data sections. User_data (00 00 01 B2) is used for closed captions etc.
// and is stripped by MKV tools, so excluding it here makes the source ES
// byte-comparable with MKV-extracted video.
// Optimized to use bytes.IndexByte for fast scanning (uses SIMD on x86).
// Populates p.filteredVideoRanges with contiguous ESOffset values; always
// returns nil (the error return exists for interface symmetry with the
// audio builder).
func (p *MPEGPSParser) buildFilteredVideoRanges() error {
	if len(p.videoRanges) == 0 {
		return nil
	}
	// Process each raw video range individually.
	// This avoids complex chunk boundary handling.
	// Pre-allocate with similar capacity to reduce reallocation.
	filteredRanges := make([]PESPayloadRange, 0, len(p.videoRanges))
	var filteredESOffset int64
	for _, rawRange := range p.videoRanges {
		// Direct slice access - zero copy, no allocation.
		endOffset := rawRange.FileOffset + int64(rawRange.Size)
		if endOffset > p.size {
			// Range extends past the file (truncated/bogus) — skip it
			// rather than read out of bounds.
			continue
		}
		data := p.data[rawRange.FileOffset:endOffset]
		// Scan for user_data sections within this PES payload.
		// Use bytes.IndexByte to quickly find 0x01 bytes (SIMD optimized).
		i := 2 // Start at position 2 since we need at least 00 00 before 01
		rangeStart := 0
		for i < len(data)-1 {
			// Find next 0x01 byte
			idx := bytes.IndexByte(data[i:], 0x01)
			if idx < 0 {
				break
			}
			pos := i + idx
			// Check if this is a user_data start code (00 00 01 B2)
			if pos >= 2 && pos < len(data)-1 &&
				data[pos-1] == 0x00 && data[pos-2] == 0x00 && data[pos+1] == UserDataStartCode {
				// Found user_data - emit the kept range that precedes it
				startCodePos := pos - 2
				if startCodePos > rangeStart {
					filteredRanges = append(filteredRanges, PESPayloadRange{
						FileOffset: rawRange.FileOffset + int64(rangeStart),
						Size:       startCodePos - rangeStart,
						ESOffset:   filteredESOffset,
					})
					filteredESOffset += int64(startCodePos - rangeStart)
				}
				// Skip the user_data section: fast-scan forward to the next
				// start code (00 00 01 ..) and resume emitting from there.
				i = pos + 2
				for i < len(data)-1 {
					idx := bytes.IndexByte(data[i:], 0x01)
					if idx < 0 {
						i = len(data)
						break
					}
					nextPos := i + idx
					if nextPos >= 2 && data[nextPos-1] == 0x00 && data[nextPos-2] == 0x00 {
						// Found next start code — back up to its first 0x00
						i = nextPos - 2
						break
					}
					i = nextPos + 1
				}
				rangeStart = i
			} else {
				i = pos + 1
			}
		}
		// Emit remaining data in this PES payload
		if rangeStart < len(data) {
			filteredRanges = append(filteredRanges, PESPayloadRange{
				FileOffset: rawRange.FileOffset + int64(rangeStart),
				Size:       len(data) - rangeStart,
				ESOffset:   filteredESOffset,
			})
			filteredESOffset += int64(len(data) - rangeStart)
		}
	}
	p.filteredVideoRanges = filteredRanges
	return nil
}
// buildFilteredAudioRanges creates per-sub-stream filtered audio ranges.
//
// For Private Stream 1 (0xBD), DVD audio has this structure:
//
//	Byte 0:    sub-stream ID (0x80-0x87 = AC3, 0x88-0x8F = DTS, etc.)
//	Byte 1:    number of audio frames
//	Bytes 2-3: first access unit pointer (offset to first audio frame)
//	Bytes 4+:  audio data (for AC3/DTS)
//
// For LPCM sub-streams (0xA0-0xA7), there are 3 additional header bytes after
// the 4-byte PS header (emphasis/mute/frame_number, quant/samplerate/channels,
// DRC), so we strip 7 bytes total. The LPCM header is parsed once per
// sub-stream on its first packet.
//
// For MPEG-1 audio streams (0xC0-0xDF), the PES payload is raw MP2 frame data
// with no sub-stream header. The stream ID is used as a pseudo sub-stream ID.
//
// Each sub-stream ID gets its own separate filtered ES to avoid interleaving
// issues. Populates p.filteredAudioBySubStream, p.audioSubStreams,
// p.lpcmSubStreams and p.lpcmInfo; always returns nil.
func (p *MPEGPSParser) buildFilteredAudioRanges() error {
	if len(p.audioRanges) == 0 {
		return nil
	}
	// Map to track ranges per sub-stream
	rangesBySubStream := make(map[byte][]PESPayloadRange)
	esOffsetBySubStream := make(map[byte]int64)
	seenSubStreams := make(map[byte]bool)
	p.lpcmSubStreams = make(map[byte]bool)
	p.lpcmInfo = make(map[byte]LPCMFrameHeader)
	for i, rawRange := range p.audioRanges {
		if rawRange.FileOffset >= p.size {
			// Payload starts past end of file — ignore.
			continue
		}
		// audioRangeStreamIDs is appended in lockstep with audioRanges
		// during Parse, so index i is valid here.
		pesStreamID := p.audioRangeStreamIDs[i]
		// MPEG-1 audio streams (0xC0-0xDF): payload is raw MP2 data, no sub-stream header
		if pesStreamID >= 0xC0 && pesStreamID <= 0xDF {
			if rawRange.Size <= 0 {
				continue
			}
			// Use the PES stream ID as a pseudo sub-stream ID
			if !seenSubStreams[pesStreamID] {
				seenSubStreams[pesStreamID] = true
				p.audioSubStreams = append(p.audioSubStreams, pesStreamID)
			}
			esOffset := esOffsetBySubStream[pesStreamID]
			rangesBySubStream[pesStreamID] = append(rangesBySubStream[pesStreamID], PESPayloadRange{
				FileOffset: rawRange.FileOffset,
				Size:       rawRange.Size,
				ESOffset:   esOffset,
			})
			esOffsetBySubStream[pesStreamID] += int64(rawRange.Size)
			continue
		}
		// Private Stream 1 (0xBD): has sub-stream header
		if rawRange.Size < 4 {
			// Too small to hold even the 4-byte PS header.
			continue
		}
		subStreamID := p.data[rawRange.FileOffset]
		// Check if this is AC3, DTS, or LPCM
		isAC3 := subStreamID >= 0x80 && subStreamID <= 0x87
		isDTS := subStreamID >= 0x88 && subStreamID <= 0x8F
		isLPCM := subStreamID >= 0xA0 && subStreamID <= 0xA7
		if isAC3 || isDTS || isLPCM {
			// Track sub-stream order
			if !seenSubStreams[subStreamID] {
				seenSubStreams[subStreamID] = true
				p.audioSubStreams = append(p.audioSubStreams, subStreamID)
			}
			if isLPCM {
				// Strip 7 bytes: 4-byte PS header + 3-byte LPCM frame header
				if rawRange.Size > LPCMTotalHeaderSize {
					// Parse LPCM header on first packet to get bit depth
					if _, ok := p.lpcmInfo[subStreamID]; !ok {
						headerEnd := rawRange.FileOffset + 4 + LPCMHeaderSize
						if headerEnd > p.size {
							continue
						}
						headerData := p.data[rawRange.FileOffset+4 : headerEnd]
						info := ParseLPCMFrameHeader(headerData)
						p.lpcmInfo[subStreamID] = info
						// Only 16-bit LPCM is supported for byte-swap matching.
						// 20/24-bit uses grouped packing that changes data size
						// during transform, so it falls through to delta.
						if IsLPCM16Bit(info.Quantization) {
							p.lpcmSubStreams[subStreamID] = true
						}
					}
					esOffset := esOffsetBySubStream[subStreamID]
					rangesBySubStream[subStreamID] = append(rangesBySubStream[subStreamID], PESPayloadRange{
						FileOffset: rawRange.FileOffset + LPCMTotalHeaderSize,
						Size:       rawRange.Size - LPCMTotalHeaderSize,
						ESOffset:   esOffset,
					})
					esOffsetBySubStream[subStreamID] += int64(rawRange.Size - LPCMTotalHeaderSize)
				}
			} else {
				// Strip the entire 4-byte header, keep only raw audio data
				if rawRange.Size > 4 {
					esOffset := esOffsetBySubStream[subStreamID]
					rangesBySubStream[subStreamID] = append(rangesBySubStream[subStreamID], PESPayloadRange{
						FileOffset: rawRange.FileOffset + 4, // Skip header (1 + 1 + 2)
						Size:       rawRange.Size - 4,        // Rest is audio data
						ESOffset:   esOffset,
					})
					esOffsetBySubStream[subStreamID] += int64(rawRange.Size - 4)
				}
			}
		}
		// Skip unknown sub-stream types (like subtitles 0x20-0x3F)
	}
	p.filteredAudioBySubStream = rangesBySubStream
	return nil
}
// parsePackHeader parses a pack header at pos and returns its total size in
// bytes: 14 plus stuffing for MPEG-2, a fixed 12 for MPEG-1. Format:
// 00 00 01 BA + SCR (6 bytes) + mux_rate (3 bytes) + stuffing.
func (p *MPEGPSParser) parsePackHeader(pos int64) (int, error) {
	// Need the 14-byte MPEG-2 minimum available to inspect the header.
	if pos+14 > p.size {
		return 0, fmt.Errorf("failed to read pack header")
	}
	hdr := p.data[pos : pos+14]
	// Byte 4 top bits distinguish MPEG-2 ('01') from MPEG-1 ('0010').
	if hdr[4]&0xC0 != 0x40 {
		// MPEG-1 pack header is a fixed 12 bytes.
		return 12, nil
	}
	// MPEG-2: 14 fixed bytes plus 0-7 stuffing bytes (count in the low
	// 3 bits of byte 13).
	return 14 + int(hdr[13]&0x07), nil
}
// parseSystemHeader parses a system header at pos and returns its total size:
// 6 bytes (start code + length field) plus the declared header length.
func (p *MPEGPSParser) parseSystemHeader(pos int64) (int, error) {
	// The 2-byte length field immediately follows the 4-byte start code.
	length, err := p.readPESLength(pos + 4)
	if err != nil {
		return 0, err
	}
	return 6 + int(length), nil
}
// readPESLength reads the big-endian 2-byte PES packet length field at pos.
// Returns an error if fewer than 2 bytes remain in the file.
func (p *MPEGPSParser) readPESLength(pos int64) (uint16, error) {
	if pos+2 > p.size {
		return 0, fmt.Errorf("failed to read PES length")
	}
	return binary.BigEndian.Uint16(p.data[pos : pos+2]), nil
}
// parsePESPacket parses a PES packet header at pos and returns packet info
// (header size, payload offset/size). Handles both MPEG-2 and MPEG-1 PES
// header layouts; a negative computed payload size is clamped to 0.
func (p *MPEGPSParser) parsePESPacket(pos int64, streamID byte) (PESPacket, error) {
	pkt := PESPacket{
		StreamID: streamID,
		Offset:   pos,
	}
	// Read length field
	length, err := p.readPESLength(pos + 4)
	if err != nil {
		return pkt, err
	}
	// PES packet structure after start code + stream ID + length:
	// - 2 bits: '10'
	// - 2 bits: PES_scrambling_control
	// - 1 bit: PES_priority
	// - 1 bit: data_alignment_indicator
	// - 1 bit: copyright
	// - 1 bit: original_or_copy
	// - 2 bits: PTS_DTS_flags
	// - 1 bit: ESCR_flag
	// - 1 bit: ES_rate_flag
	// - 1 bit: DSM_trick_mode_flag
	// - 1 bit: additional_copy_info_flag
	// - 1 bit: PES_CRC_flag
	// - 1 bit: PES_extension_flag
	// - 8 bits: PES_header_data_length
	// Then optional fields based on flags
	// Direct slice access for PES header fields
	if pos+9 > p.size {
		return pkt, fmt.Errorf("failed to read PES header")
	}
	buf := p.data[pos+6 : pos+9]
	// Check for MPEG-2 PES (starts with 10)
	if buf[0]&0xC0 == 0x80 {
		// MPEG-2 PES header: header_data_length (byte 8) says how many
		// optional-field bytes follow the fixed 3 flag/length bytes.
		headerDataLen := int(buf[2])
		pkt.HeaderSize = 6 + 3 + headerDataLen // start code(4) + length(2) + flags(2) + header_len(1) + header_data
		pkt.PayloadOffset = pos + int64(pkt.HeaderSize)
		// The declared PES length counts everything after the length field,
		// so subtract the 3 fixed bytes plus the optional header data.
		pkt.PayloadSize = int(length) - 3 - headerDataLen
	} else {
		// MPEG-1 PES header - simpler structure.
		// Skip stuffing bytes (0xFF), then decode the optional STD-buffer
		// and PTS/DTS fields to find where the payload begins.
		headerLen := 0
		offset := pos + 6
		for {
			if offset+int64(headerLen) >= p.size {
				return pkt, fmt.Errorf("failed to read PES header: offset out of range")
			}
			b := p.data[offset+int64(headerLen)]
			if b == 0xFF {
				headerLen++
				if headerLen > 16 { // Safety limit on stuffing run length
					break
				}
				continue
			}
			if b&0xC0 == 0x40 {
				// STD buffer field: 2 bytes, may be followed by timestamps
				headerLen += 2
				continue
			}
			if b&0xF0 == 0x20 {
				// PTS only: 5 bytes
				headerLen += 5
			} else if b&0xF0 == 0x30 {
				// PTS + DTS: 10 bytes
				headerLen += 10
			} else if b == 0x0F {
				// No timestamps: single marker byte
				headerLen++
			}
			// NOTE(review): a byte matching none of the patterns above falls
			// through with headerLen unchanged (treated as start of payload)
			// — presumably intentional tolerance for malformed headers.
			break
		}
		pkt.HeaderSize = 6 + headerLen
		pkt.PayloadOffset = pos + int64(pkt.HeaderSize)
		pkt.PayloadSize = int(length) - headerLen
	}
	if pkt.PayloadSize < 0 {
		// Malformed length field — clamp rather than produce a negative size.
		pkt.PayloadSize = 0
	}
	return pkt, nil
}
// VideoRanges returns all raw (unfiltered) video payload ranges found in the
// stream, in file order with cumulative ESOffset values.
func (p *MPEGPSParser) VideoRanges() []PESPayloadRange {
	return p.videoRanges
}
// FilteredVideoRangesCount returns the number of filtered video ranges
// (zero before Parse has run).
func (p *MPEGPSParser) FilteredVideoRangesCount() int {
	return len(p.filteredVideoRanges)
}
// RawVideoESSize returns the total size of the raw (unfiltered) video ES:
// the last range's ES offset plus its size. Returns 0 when no video ranges
// were found.
func (p *MPEGPSParser) RawVideoESSize() int64 {
	n := len(p.videoRanges)
	if n == 0 {
		return 0
	}
	// Ranges carry cumulative ES offsets, so the last one determines the total.
	last := p.videoRanges[n-1]
	return last.ESOffset + int64(last.Size)
}
// AudioRanges returns all raw (unfiltered) audio payload ranges found in the
// stream, in file order; sub-streams are still interleaved here.
func (p *MPEGPSParser) AudioRanges() []PESPayloadRange {
	return p.audioRanges
}
// Packets returns all parsed PES packets in file order.
func (p *MPEGPSParser) Packets() []PESPacket {
	return p.packets
}
// FileOffsetToESOffset converts a file offset within a payload to an ES
// offset by linear search over the raw (unfiltered) ranges. Returns -1 if
// the offset is not inside any known payload range.
func (p *MPEGPSParser) FileOffsetToESOffset(fileOffset int64, isVideo bool) int64 {
	var ranges []PESPayloadRange
	if isVideo {
		ranges = p.videoRanges
	} else {
		ranges = p.audioRanges
	}
	for _, r := range ranges {
		delta := fileOffset - r.FileOffset
		if delta >= 0 && delta < int64(r.Size) {
			return r.ESOffset + delta
		}
	}
	return -1
}
// ESOffsetToFileOffset converts an ES offset to a file offset by linear
// search over the raw (unfiltered) ranges. Returns the file offset and the
// number of payload bytes remaining in that range, or (-1, 0) if not found.
func (p *MPEGPSParser) ESOffsetToFileOffset(esOffset int64, isVideo bool) (fileOffset int64, remaining int) {
	var ranges []PESPayloadRange
	if isVideo {
		ranges = p.videoRanges
	} else {
		ranges = p.audioRanges
	}
	for _, r := range ranges {
		delta := esOffset - r.ESOffset
		if delta >= 0 && delta < int64(r.Size) {
			return r.FileOffset + delta, r.Size - int(delta)
		}
	}
	return -1, 0
}
// TotalESSize returns the total size of the elementary stream. For video it
// returns the filtered ES size when filtering has produced ranges, otherwise
// the raw size. For audio it returns 0 — use AudioSubStreamESSize instead.
func (p *MPEGPSParser) TotalESSize(isVideo bool) int64 {
	if !isVideo {
		// Audio is tracked per sub-stream.
		return 0
	}
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return totalESSizeFromRanges(ranges)
}
// AudioSubStreams returns the list of audio sub-stream IDs in order of
// first appearance in the stream.
func (p *MPEGPSParser) AudioSubStreams() []byte {
	return p.audioSubStreams
}
// AudioSubStreamCount returns the number of distinct audio sub-streams seen.
func (p *MPEGPSParser) AudioSubStreamCount() int {
	return len(p.audioSubStreams)
}
// AudioSubStreamESSize returns the total filtered ES size for a specific
// audio sub-stream (0 for unknown sub-streams — the map lookup yields nil).
func (p *MPEGPSParser) AudioSubStreamESSize(subStreamID byte) int64 {
	return totalESSizeFromRanges(p.filteredAudioBySubStream[subStreamID])
}
// FilteredVideoRanges returns the filtered video payload ranges for
// zero-copy iteration, falling back to the raw ranges when filtering is
// disabled or produced nothing.
func (p *MPEGPSParser) FilteredVideoRanges() []PESPayloadRange {
	if !p.filterUserData || len(p.filteredVideoRanges) == 0 {
		return p.videoRanges
	}
	return p.filteredVideoRanges
}
// FilteredAudioRanges returns the filtered audio payload ranges for a
// specific sub-stream. Returns nil if the sub-stream doesn't exist.
func (p *MPEGPSParser) FilteredAudioRanges(subStreamID byte) []PESPayloadRange {
	return p.filteredAudioBySubStream[subStreamID]
}
// Data returns the raw mmap'd file data for zero-copy access.
// Callers must not mutate the returned slice.
func (p *MPEGPSParser) Data() []byte {
	return p.data
}
// DataSlice returns a zero-copy sub-slice of the backing data at the given
// offset and size. Offsets are file-relative; no bounds checking beyond the
// slice expression itself.
func (p *MPEGPSParser) DataSlice(off int64, size int) []byte {
	return p.data[off : off+int64(size)]
}
// DataSize returns the total size of the backing data in bytes.
func (p *MPEGPSParser) DataSize() int64 {
	return p.size
}
// ReadESByteWithHint reads a single byte from the video ES, using a range
// hint to avoid a binary search during sequential reads. Returns the byte,
// the range index where it was found (pass back as the hint on the next
// call), and a success flag. Pass rangeHint=-1 to force a binary search.
// Audio always returns (0, -1, false) — it goes through the per-sub-stream
// reader instead.
func (p *MPEGPSParser) ReadESByteWithHint(esOffset int64, isVideo bool, rangeHint int) (byte, int, bool) {
	if !isVideo {
		return 0, -1, false
	}
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		// Prefer the user_data-stripped view when available.
		ranges = p.filteredVideoRanges
	}
	return readByteWithHint(p.data, nil, p.size, ranges, esOffset, rangeHint)
}
// ReadAudioByteWithHint reads a single byte from an audio sub-stream, using
// a range hint. For 16-bit LPCM sub-streams the even/odd byte positions are
// swapped so the big-endian source reads back as little-endian.
func (p *MPEGPSParser) ReadAudioByteWithHint(subStreamID byte, esOffset int64, rangeHint int) (byte, int, bool) {
	ranges := p.filteredAudioBySubStream[subStreamID]
	off := esOffset
	if p.lpcmSubStreams[subStreamID] {
		// Swap within each 2-byte pair: XOR the offset with 1.
		off ^= 1
	}
	return readByteWithHint(p.data, nil, p.size, ranges, off, rangeHint)
}
// Video start codes that should be stripped (everything else is kept).
const (
	UserDataStartCode = 0xB2 // user_data (closed captions etc.) — stripped by MKV tools
)
// RawRange represents a contiguous chunk of raw file data corresponding to
// part of an ES region. Used for converting ES offsets to raw file offsets.
type RawRange struct {
	FileOffset int64 // Offset in the raw file
	Size       int   // Size of this chunk in bytes
}
// RawRangesForESRegion returns the raw file ranges that contain the given ES
// region. Video only — audio must use RawRangesForAudioSubStream. When
// filtering has produced ranges, the filtered (user_data-stripped) view is
// used; otherwise the raw video ranges are.
func (p *MPEGPSParser) RawRangesForESRegion(esOffset int64, size int, isVideo bool) ([]RawRange, error) {
	if !isVideo {
		return nil, fmt.Errorf("audio uses per-sub-stream methods, use RawRangesForAudioSubStream")
	}
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return rawRangesFromPESRanges(ranges, esOffset, size)
}
// RawRangesForAudioSubStream returns the raw file ranges covering audio data
// from a specific sub-stream's filtered ES. Errors if the sub-stream was
// never seen during parsing.
func (p *MPEGPSParser) RawRangesForAudioSubStream(subStreamID byte, esOffset int64, size int) ([]RawRange, error) {
	ranges, ok := p.filteredAudioBySubStream[subStreamID]
	if !ok {
		return nil, fmt.Errorf("audio sub-stream 0x%02X not found", subStreamID)
	}
	return rawRangesFromPESRanges(ranges, esOffset, size)
}
// ReadESData reads elementary stream data at the given ES offset. For video
// this returns FILTERED ES data (user_data sections excluded) when filtering
// produced ranges, else the raw view. Audio returns an error — use
// ReadAudioSubStreamData instead.
func (p *MPEGPSParser) ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error) {
	if !isVideo {
		return nil, fmt.Errorf("audio uses per-sub-stream methods, use ReadAudioSubStreamData")
	}
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return readFromRanges(p.data, nil, p.size, ranges, esOffset, size)
}
// ReadAudioSubStreamData reads audio data from a specific sub-stream.
// For LPCM sub-streams, the data is byte-swapped to match MKV little-endian format.
// Handles alignment: if esOffset is odd, reads from the pair-aligned offset,
// swaps, and returns only the requested portion.
//
// Parameters:
//   - subStreamID: DVD sub-stream ID (0xA0-0xA7 are LPCM).
//   - esOffset:    offset into the sub-stream's elementary stream.
//   - size:        number of bytes requested.
//
// Returns the requested bytes (a fresh copy for LPCM; possibly a zero-copy
// slice for other codecs) or an error if the sub-stream is unknown or the
// region cannot be read.
func (p *MPEGPSParser) ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error) {
	ranges, ok := p.filteredAudioBySubStream[subStreamID]
	if !ok {
		return nil, fmt.Errorf("audio sub-stream 0x%02X not found", subStreamID)
	}
	// Non-LPCM audio needs no transformation — read straight from the ranges.
	if !p.lpcmSubStreams[subStreamID] {
		return readFromRanges(p.data, nil, p.size, ranges, esOffset, size)
	}
	// LPCM 16-bit forward transform (DVD big-endian → MKV little-endian).
	// Byte-swap pairs are aligned to the ES start (pairs at offsets 0-1, 2-3, ...).
	// If esOffset is odd, we must read one extra byte before to complete the pair.
	alignedOffset := esOffset
	trimFront := 0
	if esOffset%2 == 1 {
		alignedOffset = esOffset - 1
		trimFront = 1
	}
	alignedSize := size + trimFront
	// If alignedSize is odd, extend by 1 to complete the trailing pair
	// (if data is available).
	trimBack := 0
	if alignedSize%2 == 1 {
		alignedSize++
		trimBack = 1
	}
	data, err := readFromRanges(p.data, nil, p.size, ranges, alignedOffset, alignedSize)
	if err != nil {
		// If extending caused an out-of-range error, retry without the trailing extension
		if trimBack > 0 {
			alignedSize--
			trimBack = 0
			data, err = readFromRanges(p.data, nil, p.size, ranges, alignedOffset, alignedSize)
		}
		// Retry also failed (or we never retried) — propagate the error.
		if err != nil {
			return nil, err
		}
	}
	// readFromRanges may return a zero-copy mmap slice, so clone first
	// (TransformLPCM16BE mutates its argument in place).
	result := make([]byte, len(data))
	copy(result, data)
	TransformLPCM16BE(result)
	// Trim to the originally requested range
	start := trimFront
	end := start + size
	if end > len(result) {
		end = len(result)
	}
	return result[start:end], nil
}
// IsLPCMSubStream returns true if the given sub-stream ID is an LPCM sub-stream.
func (p *MPEGPSParser) IsLPCMSubStream(subStreamID byte) bool {
	isLPCM := p.lpcmSubStreams[subStreamID]
	return isLPCM
}
package source
import (
"fmt"
"io"
"os"
"strings"
)
// detectDVDCodecs extracts codec information from an already-indexed DVD source.
// The MPEG-PS parser has already identified video and audio streams during indexing.
func detectDVDCodecs(index *Index) (*SourceCodecs, error) {
	result := &SourceCodecs{}
	for _, reader := range index.ESReaders {
		parser, isPS := reader.(*MPEGPSParser)
		if !isPS {
			continue
		}
		// A non-empty video ES on a DVD means MPEG-2 video.
		if parser.TotalESSize(true) > 0 && !containsCodec(result.VideoCodecs, CodecMPEG2Video) {
			result.VideoCodecs = append(result.VideoCodecs, CodecMPEG2Video)
		}
		// Map each audio sub-stream ID (Private Stream 1 and MPEG-1 audio
		// ranges) to its codec family.
		for _, id := range parser.AudioSubStreams() {
			var codec CodecType
			switch {
			case id >= 0x80 && id <= 0x87:
				codec = CodecAC3Audio
			case id >= 0x88 && id <= 0x8F:
				codec = CodecDTSAudio
			case id >= 0xA0 && id <= 0xA7:
				codec = CodecLPCMAudio
			case id >= 0xC0 && id <= 0xDF:
				codec = CodecMPEGAudio
			default:
				continue
			}
			if !containsCodec(result.AudioCodecs, codec) {
				result.AudioCodecs = append(result.AudioCodecs, codec)
			}
		}
	}
	return result, nil
}
// detectDVDCodecsFromFile detects codecs from a DVD ISO by parsing VTS IFO
// metadata files. IFO files authoritatively declare every stream in each title
// set, unlike PES scanning which can miss audio streams that appear later in
// the VOB data. Falls back to PES scanning if IFO parsing fails.
func detectDVDCodecsFromFile(path string) (*SourceCodecs, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open ISO file: %w", err)
	}
	defer f.Close()
	// Prefer IFO metadata: ISO9660 layout first, then UDF. Any failure
	// falls through to the PES scan below.
	if ifos := findIFOsInISO(f); len(ifos) > 0 {
		if codecs, ifoErr := detectDVDCodecsFromIFOs(f, ifos); ifoErr == nil {
			return codecs, nil
		}
	}
	if ifos, udfErr := findIFOsInUDF(f); udfErr == nil && len(ifos) > 0 {
		if codecs, ifoErr := detectDVDCodecsFromIFOs(f, ifos); ifoErr == nil {
			return codecs, nil
		}
	}
	// Last resort: scan raw PES start codes in the VOB data.
	return detectDVDCodecsFromFilePES(f)
}
// detectDVDCodecsFromFilePES scans PES start codes in VOB data to detect codecs.
// This is the legacy approach, kept as a fallback for ISOs where IFO parsing fails.
func detectDVDCodecsFromFilePES(f *os.File) (*SourceCodecs, error) {
	vobs := findContentVOBs(f)
	if len(vobs) == 0 {
		// Filesystem navigation failed — scan from the start of the ISO.
		return scanDVDRegion(f, 0)
	}
	union := &SourceCodecs{}
	scanned := false
	var lastErr error
	for _, vob := range significantFiles(vobs) {
		codecs, err := scanDVDRegion(f, vob.Offset)
		if err != nil {
			lastErr = err
			continue
		}
		mergeSourceCodecs(union, codecs)
		scanned = true
	}
	if scanned {
		return union, nil
	}
	// Every VOB scan failed — retry once from the start of the ISO.
	fallback, err := scanDVDRegion(f, 0)
	if err == nil {
		return fallback, nil
	}
	if lastErr != nil {
		return nil, fmt.Errorf("failed to scan any DVD VOBs: %w", lastErr)
	}
	return nil, err
}
// scanDVDRegion reads 4MB from the given offset and scans for MPEG-PS codecs.
func scanDVDRegion(f *os.File, offset int64) (*SourceCodecs, error) {
	const scanSize = 4 * 1024 * 1024
	buf := make([]byte, scanSize)
	n, err := f.ReadAt(buf, offset)
	switch {
	case err == io.EOF || err == io.ErrUnexpectedEOF:
		// Short read near end of file — use what we got.
		buf = buf[:n]
	case err != nil:
		return nil, fmt.Errorf("read %s at offset %d: %w", f.Name(), offset, err)
	}
	if n == 0 {
		return nil, fmt.Errorf("no data at offset %d in %s", offset, f.Name())
	}
	return scanPESCodecs(buf)
}
// scanPESCodecs scans a byte buffer for MPEG-PS PES headers and extracts codec information.
func scanPESCodecs(buf []byte) (*SourceCodecs, error) {
	codecs := &SourceCodecs{}
	addAudio := func(ct CodecType) {
		if !containsCodec(codecs.AudioCodecs, ct) {
			codecs.AudioCodecs = append(codecs.AudioCodecs, ct)
		}
	}
	for i := 0; i+3 < len(buf); i++ {
		// PES start code prefix: 00 00 01, followed by the stream ID.
		if buf[i] != 0x00 || buf[i+1] != 0x00 || buf[i+2] != 0x01 {
			continue
		}
		id := buf[i+3]
		switch {
		case id >= 0xE0 && id <= 0xEF:
			// Video stream — DVD video is always MPEG-2.
			if !containsCodec(codecs.VideoCodecs, CodecMPEG2Video) {
				codecs.VideoCodecs = append(codecs.VideoCodecs, CodecMPEG2Video)
			}
		case id == 0xBD:
			// Private Stream 1 — the sub-stream ID just past the PES header
			// distinguishes AC3, DTS, and LPCM.
			if i+9 >= len(buf) {
				break
			}
			subOff := i + 9 + int(buf[i+8])
			if subOff >= len(buf) {
				break
			}
			switch sub := buf[subOff]; {
			case sub >= 0x80 && sub <= 0x87:
				addAudio(CodecAC3Audio)
			case sub >= 0x88 && sub <= 0x8F:
				addAudio(CodecDTSAudio)
			case sub >= 0xA0 && sub <= 0xA7:
				addAudio(CodecLPCMAudio)
			}
		case id >= 0xC0 && id <= 0xDF:
			// MPEG audio stream.
			addAudio(CodecMPEGAudio)
		}
	}
	if len(codecs.VideoCodecs) == 0 && len(codecs.AudioCodecs) == 0 {
		return nil, fmt.Errorf("no DVD codecs detected in scanned region")
	}
	return codecs, nil
}
// findContentVOBs navigates the ISO9660 filesystem to find all content VOBs
// (VTS_xx_1.VOB, the first part of each title set). Returns nil if navigation
// fails, signaling the caller to fall back to scanning from the ISO start.
// Uses readISOPVDRoot/readISODirectory/findISOEntry from iso.go.
func findContentVOBs(f *os.File) []isoFileExtent {
	rootLBA, rootLen, err := readISOPVDRoot(f)
	if err != nil {
		return nil
	}
	rootEntries, err := readISODirectory(f, rootLBA, rootLen)
	if err != nil {
		return nil
	}
	videoTS, err := findISOEntry(rootEntries, "VIDEO_TS")
	if err != nil {
		return nil
	}
	entries, err := readISODirectory(f, uint32(videoTS.Offset/isoSectorSize), uint32(videoTS.Size))
	if err != nil {
		return nil
	}
	// Keep only VTS_xx_1.VOB (12 characters, '1' at index 7): _0 is
	// navigation-only and _2+ are continuations sharing the same audio
	// layout, so the first content part of each title set suffices.
	var result []isoFileExtent
	for _, entry := range entries {
		if entry.IsDir || len(entry.Name) != 12 {
			continue
		}
		if !strings.HasPrefix(entry.Name, "VTS_") || !strings.HasSuffix(entry.Name, ".VOB") {
			continue
		}
		if entry.Name[7] == '1' {
			result = append(result, entry)
		}
	}
	return result
}
package source
import "fmt"
// MPEGTSParser parses MPEG Transport Stream (M2TS) files to extract elementary
// stream data. This is the Blu-ray equivalent of MPEGPSParser for DVDs.
//
// M2TS files use 192-byte packets: 4-byte timestamp + 188-byte TS packet.
// Each TS packet carries a fragment of a PES packet, identified by PID.
// PES packets span multiple TS packets and contain the actual codec data.
//
// The parser builds PES payload range tables that map ES offsets to raw file
// offsets, enabling the matcher to work with continuous ES data while the
// underlying file has TS headers interleaved.
type MPEGTSParser struct {
	data        []byte           // mmap'd file data (zero-copy); nil when using multiRegion
	multiRegion *multiRegionData // non-nil for multi-extent UDF files
	size        int64            // total logical size of the backing data in bytes
	packetSize  int              // 192 (M2TS) or 188 (standard TS)
	tsOffset    int              // offset from packet start to TS sync byte (4 for M2TS, 0 for TS)
	// Stream PIDs from PMT
	videoPID   uint16
	audioPIDs  []uint16  // ordered by PMT appearance
	videoCodec CodecType // for user_data filtering decision
	// PES payload ranges (one entry per TS payload chunk for tracked PIDs)
	videoRanges         []PESPayloadRange
	filteredVideoRanges []PESPayloadRange // excludes user_data for MPEG-2 only
	audioBySubStream    map[byte][]PESPayloadRange
	// Audio PID → sub-stream ID mapping
	audioSubStreams []byte             // sequential IDs: 0, 1, 2, ...
	pidToSubStream  map[uint16]byte    // PID → sub-stream ID
	subStreamToPID  map[byte]uint16    // sub-stream ID → PID
	subStreamCodec  map[byte]CodecType // codec type per sub-stream
	filterUserData  bool               // whether user_data filtering is enabled for video
}
// NewMPEGTSParser creates a parser for the given memory-mapped M2TS data.
func NewMPEGTSParser(data []byte) *MPEGTSParser {
	p := &MPEGTSParser{
		data: data,
		size: int64(len(data)),
	}
	p.audioBySubStream = make(map[byte][]PESPayloadRange)
	p.pidToSubStream = make(map[uint16]byte)
	p.subStreamToPID = make(map[byte]uint16)
	p.subStreamCodec = make(map[byte]CodecType)
	return p
}
// NewMPEGTSParserMultiRegion creates a parser for non-contiguous M2TS data
// from a multi-extent UDF file. The multiRegionData provides a virtual
// contiguous view over multiple mmap sub-slices.
func NewMPEGTSParserMultiRegion(mr *multiRegionData) *MPEGTSParser {
	p := &MPEGTSParser{
		multiRegion: mr,
		size:        mr.Len(),
	}
	p.audioBySubStream = make(map[byte][]PESPayloadRange)
	p.pidToSubStream = make(map[uint16]byte)
	p.subStreamToPID = make(map[byte]uint16)
	p.subStreamCodec = make(map[byte]CodecType)
	return p
}
// dataSlice returns a sub-slice of the parser's data source.
// Uses multiRegion when available, otherwise direct slice of p.data.
func (p *MPEGTSParser) dataSlice(off, end int64) []byte {
	if mr := p.multiRegion; mr != nil {
		return mr.Slice(off, end)
	}
	return p.data[off:end]
}
// MPEGTSProgressFunc is called to report MPEG-TS parsing progress.
// processed/total are presumably byte counts of input consumed vs. overall —
// confirm against the indexer's call sites.
type MPEGTSProgressFunc func(processed, total int64)
// --- ESReader interface implementation ---
// ReadESData reads elementary stream data at the given ES offset.
// For video, returns filtered ES data (user_data stripped) when filtering is
// enabled; otherwise the raw video ES. Audio must be read via
// ReadAudioSubStreamData; requesting audio here returns an error.
func (p *MPEGTSParser) ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error) {
	if !isVideo {
		return nil, fmt.Errorf("audio uses per-sub-stream methods, use ReadAudioSubStreamData")
	}
	// Consistent with TotalESSize, FilteredVideoRanges and MPEGPSParser:
	// only consult the filtered ranges when user_data filtering is actually
	// enabled, so the reported ES size and the data read here always agree.
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return readFromRanges(p.data, p.multiRegion, p.size, ranges, esOffset, size)
}
// ESOffsetToFileOffset converts an ES offset to a file offset and remaining bytes.
// Returns the absolute raw-file offset for the given video ES offset plus the
// number of contiguous bytes left in that payload range. Returns (-1, 0) for
// audio requests (audio uses per-sub-stream tables) or when no range covers
// the offset.
func (p *MPEGTSParser) ESOffsetToFileOffset(esOffset int64, isVideo bool) (fileOffset int64, remaining int) {
	if !isVideo {
		// Audio offsets are resolved via the per-sub-stream range tables.
		return -1, 0
	}
	// Consistent with TotalESSize and MPEGPSParser: only consult the filtered
	// ranges when user_data filtering is actually enabled, so offsets computed
	// against the reported ES size resolve against the same range table.
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	idx := binarySearchRanges(ranges, esOffset)
	if idx < 0 {
		return -1, 0
	}
	r := ranges[idx]
	offsetInPayload := esOffset - r.ESOffset
	return r.FileOffset + offsetInPayload, r.Size - int(offsetInPayload)
}
// TotalESSize returns the total size of the elementary stream.
// Audio sizes are reported per sub-stream (AudioSubStreamESSize), so audio
// requests return 0.
func (p *MPEGTSParser) TotalESSize(isVideo bool) int64 {
	if !isVideo {
		return 0
	}
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return totalESSizeFromRanges(ranges)
}
// AudioSubStreams returns the list of audio sub-stream IDs.
func (p *MPEGTSParser) AudioSubStreams() []byte {
	ids := p.audioSubStreams
	return ids
}
// SubtitleSubStreams returns the sub-stream IDs that carry subtitle data (e.g., PGS).
func (p *MPEGTSParser) SubtitleSubStreams() []byte {
	var subtitleIDs []byte
	for _, subID := range p.audioSubStreams {
		codec := p.subStreamCodec[subID]
		if IsSubtitleCodec(codec) {
			subtitleIDs = append(subtitleIDs, subID)
		}
	}
	return subtitleIDs
}
// AudioSubStreamESSize returns the ES size for a specific audio sub-stream.
func (p *MPEGTSParser) AudioSubStreamESSize(subStreamID byte) int64 {
	ranges := p.audioBySubStream[subStreamID]
	return totalESSizeFromRanges(ranges)
}
// ReadAudioSubStreamData reads audio data from a specific sub-stream.
func (p *MPEGTSParser) ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error) {
	ranges, exists := p.audioBySubStream[subStreamID]
	if !exists {
		return nil, fmt.Errorf("audio sub-stream %d not found", subStreamID)
	}
	return readFromRanges(p.data, p.multiRegion, p.size, ranges, esOffset, size)
}
// --- ESRangeConverter interface implementation ---
// RawRangesForESRegion returns the raw file ranges for a video ES region.
// Audio regions must use RawRangesForAudioSubStream.
func (p *MPEGTSParser) RawRangesForESRegion(esOffset int64, size int, isVideo bool) ([]RawRange, error) {
	if !isVideo {
		return nil, fmt.Errorf("audio uses per-sub-stream methods, use RawRangesForAudioSubStream")
	}
	// Consistent with TotalESSize and MPEGPSParser: only consult the filtered
	// ranges when user_data filtering is actually enabled, so ES offsets and
	// the reported ES size always refer to the same range table.
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return rawRangesFromPESRanges(ranges, esOffset, size)
}
// RawRangesForAudioSubStream returns the raw file ranges for audio data from a specific sub-stream.
func (p *MPEGTSParser) RawRangesForAudioSubStream(subStreamID byte, esOffset int64, size int) ([]RawRange, error) {
	ranges, exists := p.audioBySubStream[subStreamID]
	if !exists {
		return nil, fmt.Errorf("audio sub-stream %d not found", subStreamID)
	}
	return rawRangesFromPESRanges(ranges, esOffset, size)
}
// --- Hint-based reading for matcher hot path ---
// ReadESByteWithHint reads a single byte from the video ES with a range hint.
// Returns the byte, a hint value for the next call, and whether the read
// succeeded; audio requests always fail (use ReadAudioByteWithHint).
func (p *MPEGTSParser) ReadESByteWithHint(esOffset int64, isVideo bool, rangeHint int) (byte, int, bool) {
	if !isVideo {
		return 0, -1, false
	}
	// Consistent with TotalESSize and MPEGPSParser: only consult the filtered
	// ranges when user_data filtering is actually enabled, so hint-based reads
	// see the same ES view as size reporting.
	ranges := p.videoRanges
	if p.filterUserData && len(p.filteredVideoRanges) > 0 {
		ranges = p.filteredVideoRanges
	}
	return readByteWithHint(p.data, p.multiRegion, p.size, ranges, esOffset, rangeHint)
}
// ReadAudioByteWithHint reads a single byte from an audio sub-stream with a range hint.
func (p *MPEGTSParser) ReadAudioByteWithHint(subStreamID byte, esOffset int64, rangeHint int) (byte, int, bool) {
	ranges := p.audioBySubStream[subStreamID]
	return readByteWithHint(p.data, p.multiRegion, p.size, ranges, esOffset, rangeHint)
}
// IsLPCMSubStream always returns false for MPEG-TS (LPCM is DVD-only).
func (p *MPEGTSParser) IsLPCMSubStream(_ byte) bool {
	const lpcmOnBluray = false // LPCM byte-swapping only applies to DVD sources
	return lpcmOnBluray
}
// --- Accessors for indexer ---
// Data returns the raw mmap'd file data for zero-copy access.
// Returns nil when using multi-region data; use DataSlice instead.
func (p *MPEGTSParser) Data() []byte {
	raw := p.data
	return raw
}
// DataSlice returns a sub-slice of the backing data at the given offset and size.
// Works for both contiguous and multi-region data.
func (p *MPEGTSParser) DataSlice(off int64, size int) []byte {
	end := off + int64(size)
	if p.multiRegion != nil {
		return p.multiRegion.Slice(off, end)
	}
	return p.data[off:end]
}
// DataSize returns the total size of the backing data.
func (p *MPEGTSParser) DataSize() int64 {
	total := p.size
	return total
}
// FilteredVideoRanges returns the filtered video payload ranges.
// Falls back to the raw video ranges when filtering is disabled or produced
// no ranges.
func (p *MPEGTSParser) FilteredVideoRanges() []PESPayloadRange {
	if !p.filterUserData || len(p.filteredVideoRanges) == 0 {
		return p.videoRanges
	}
	return p.filteredVideoRanges
}
// FilteredAudioRanges returns the audio payload ranges for a specific sub-stream.
func (p *MPEGTSParser) FilteredAudioRanges(subStreamID byte) []PESPayloadRange {
	ranges := p.audioBySubStream[subStreamID]
	return ranges
}
// RawVideoESSize returns the total size of raw (unfiltered) video ES.
func (p *MPEGTSParser) RawVideoESSize() int64 {
	rawRanges := p.videoRanges
	return totalESSizeFromRanges(rawRanges)
}
// FilteredVideoRangesCount returns the number of filtered video ranges.
func (p *MPEGTSParser) FilteredVideoRangesCount() int {
	count := len(p.filteredVideoRanges)
	return count
}
// AudioSubStreamCount returns the number of audio sub-streams.
func (p *MPEGTSParser) AudioSubStreamCount() int {
	count := len(p.audioSubStreams)
	return count
}
// VideoPID returns the video PID detected from the PMT.
func (p *MPEGTSParser) VideoPID() uint16 {
	pid := p.videoPID
	return pid
}
// AudioPIDs returns the audio PIDs detected from the PMT.
func (p *MPEGTSParser) AudioPIDs() []uint16 {
	pids := p.audioPIDs
	return pids
}
// VideoCodec returns the video codec type detected from the PMT.
func (p *MPEGTSParser) VideoCodec() CodecType {
	codec := p.videoCodec
	return codec
}
// Ensure MPEGTSParser implements the required interfaces at compile time.
// A build failure on these lines means a method of the interface is missing
// or has the wrong signature.
var (
	_ ESReader         = (*MPEGTSParser)(nil)
	_ ESRangeConverter = (*MPEGTSParser)(nil)
)
package source
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
)
// detectBlurayCodecs scans PMTs from indexed M2TS files to detect codecs.
// This is a fallback for when the pre-index DetectSourceCodecsFromDir check
// was skipped (e.g., detection failure).
func detectBlurayCodecs(index *Index) (*SourceCodecs, error) {
	if len(index.Files) == 0 {
		return nil, fmt.Errorf("no source files in index")
	}
	// ISO indexing can create several entries sharing one RelativePath;
	// scan each distinct file only once.
	visited := make(map[string]struct{}, len(index.Files))
	var targets []codecScanTarget
	for _, file := range index.Files {
		fullPath := filepath.Join(index.SourceDir, file.RelativePath)
		if _, dup := visited[fullPath]; dup {
			continue
		}
		visited[fullPath] = struct{}{}
		targets = append(targets, codecScanTarget{Path: fullPath, Size: file.Size})
	}
	return detectBlurayCodecsMulti(significantTargets(targets))
}
// detectBlurayCodecsMulti scans multiple M2TS files or ISOs and unions their
// codec information. ISO files are handled correctly via detectBlurayCodecsFromFile
// which parses their internal M2TS structure. Returns an error if no file could
// be scanned.
func detectBlurayCodecsMulti(targets []codecScanTarget) (*SourceCodecs, error) {
	if len(targets) == 0 {
		return nil, fmt.Errorf("no Blu-ray media files to scan")
	}
	union := &SourceCodecs{}
	scanned := false
	var lastErr error
	for _, target := range targets {
		codecs, err := detectBlurayCodecsFromFile(target.Path)
		if err != nil {
			lastErr = err
			continue
		}
		mergeSourceCodecs(union, codecs)
		scanned = true
	}
	if scanned {
		return union, nil
	}
	if lastErr != nil {
		return nil, fmt.Errorf("failed to scan any Blu-ray codecs: %w", lastErr)
	}
	return nil, fmt.Errorf("failed to scan any Blu-ray codecs")
}
// detectBlurayCodecsFromFile detects codecs from a single M2TS file or a
// Blu-ray ISO. For ISOs, it first tries parsing CLPI metadata files which
// authoritatively declare all streams, falling back to PMT scanning.
func detectBlurayCodecsFromFile(path string) (*SourceCodecs, error) {
	lower := strings.ToLower(path)
	if strings.HasSuffix(lower, ".iso") {
		return detectBlurayCodecsFromISO(path)
	}
	return scanM2TSCodecs(path, 0)
}
// detectBlurayCodecsFromISO detects codecs from a Blu-ray ISO. Tries CLPI
// metadata first (fast, authoritative), falls back to PMT scanning from M2TS data.
func detectBlurayCodecsFromISO(path string) (*SourceCodecs, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open ISO: %w", err)
	}
	defer f.Close()
	// CLPI metadata first: ISO9660 layout, then UDF.
	if clpis, findErr := findCLPIsInISO(f); findErr == nil && len(clpis) > 0 {
		if codecs, clpiErr := detectBlurayCodecsFromCLPIs(f, clpis); clpiErr == nil {
			return codecs, nil
		}
	}
	if clpis, findErr := findCLPIsInUDF(f); findErr == nil && len(clpis) > 0 {
		if codecs, clpiErr := detectBlurayCodecsFromCLPIs(f, clpis); clpiErr == nil {
			return codecs, nil
		}
	}
	// Fallback: scan PMTs from the M2TS payload data.
	return detectBlurayCodecsFromISOPMT(path)
}
// detectBlurayCodecsFromISOPMT scans PMT data from M2TS files within a Blu-ray ISO.
// This is the legacy approach, kept as a fallback for ISOs where CLPI parsing fails.
func detectBlurayCodecsFromISOPMT(path string) (*SourceCodecs, error) {
	m2tsFiles, err := findBlurayM2TSInISO(path)
	if err != nil {
		return nil, fmt.Errorf("find M2TS in ISO: %w", err)
	}
	if len(m2tsFiles) == 0 {
		return nil, fmt.Errorf("no M2TS files found in Blu-ray ISO")
	}
	union := &SourceCodecs{}
	scanned := false
	var lastErr error
	for _, m2ts := range significantFiles(m2tsFiles) {
		codecs, scanErr := scanM2TSCodecs(path, m2ts.Offset)
		if scanErr != nil {
			lastErr = scanErr
			continue
		}
		mergeSourceCodecs(union, codecs)
		scanned = true
	}
	if scanned {
		return union, nil
	}
	if lastErr != nil {
		return nil, fmt.Errorf("failed to scan any M2TS in ISO: %w", lastErr)
	}
	return nil, fmt.Errorf("failed to scan any M2TS in ISO")
}
// scanM2TSCodecs reads 2MB of M2TS data at the given offset and parses the
// PAT/PMT to extract codec information from a single M2TS stream.
func scanM2TSCodecs(path string, readOffset int64) (*SourceCodecs, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open file: %w", err)
	}
	defer f.Close()
	const scanSize = 2 * 1024 * 1024
	buf := make([]byte, scanSize)
	n, err := f.ReadAt(buf, readOffset)
	// A short read at end of file is acceptable; anything else is fatal.
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, fmt.Errorf("read M2TS data from %s: %w", path, err)
	}
	if n == 0 {
		return nil, fmt.Errorf("no M2TS data at offset %d in %s", readOffset, path)
	}
	buf = buf[:n]
	// Need at least four packets to reliably detect TS framing.
	if len(buf) < 192*4 {
		return nil, fmt.Errorf("M2TS data too small to detect TS structure (%d bytes)", len(buf))
	}
	return parseTSCodecs(buf)
}
// parseTSCodecs scans MPEG-TS data to find the PAT and PMT and extract stream types.
// This uses reassemblePSISection to correctly handle PMTs that span multiple TS
// packets (common on Blu-rays with many audio and subtitle streams).
//
// Returns the codecs declared in the PMT, or an error if no valid TS framing,
// PAT, or PMT could be found in the buffer.
func parseTSCodecs(data []byte) (*SourceCodecs, error) {
	// Detect TS packet size: 188 (standard) or 192 (M2TS with 4-byte timestamp)
	packetSize, startOffset := detectTSPacketSize(data)
	if packetSize == 0 {
		return nil, fmt.Errorf("cannot detect TS packet size")
	}
	tsOffset := 0
	if packetSize == 192 {
		tsOffset = 4 // skip the 4-byte M2TS timestamp to reach the TS sync byte
	}
	// Step 1: Find PAT (PID 0x0000) to get PMT PID
	patSection, err := reassemblePSISection(data, startOffset, packetSize, tsOffset, 0, 0x00)
	if err != nil {
		return nil, fmt.Errorf("find PAT: %w", err)
	}
	pmtPID := pmtPIDFromPAT(patSection)
	if pmtPID == 0 {
		return nil, fmt.Errorf("PMT PID not found in PAT")
	}
	// Step 2: Reassemble complete PMT section (may span multiple TS packets)
	pmtSection, err := reassemblePSISection(data, startOffset, packetSize, tsOffset, pmtPID, 0x02)
	if err != nil {
		return nil, fmt.Errorf("find PMT: %w", err)
	}
	// Step 3: Extract stream types from the reassembled PMT
	codecs := &SourceCodecs{}
	if len(pmtSection) >= 12 {
		// program_info_length: 12 bits spanning bytes 10-11 of the section.
		progInfoLen := int(pmtSection[10]&0x0F)<<8 | int(pmtSection[11])
		streamsStart := 12 + progInfoLen
		// section_length: 12 bits spanning bytes 1-2; the section occupies
		// 3+section_length bytes, the last 4 of which are the CRC32.
		sectionLen := int(pmtSection[1]&0x0F)<<8 | int(pmtSection[2])
		streamsEnd := 3 + sectionLen - 4 // exclude CRC32
		if streamsEnd > len(pmtSection) {
			streamsEnd = len(pmtSection)
		}
		// Each entry is 5 fixed bytes (stream_type, elementary_PID,
		// ES_info_length) followed by ES_info_length descriptor bytes.
		for j := streamsStart; j+5 <= streamsEnd; {
			streamType := pmtSection[j]
			esInfoLen := int(pmtSection[j+3]&0x0F)<<8 | int(pmtSection[j+4])
			ct := tsStreamTypeToCodecType(streamType)
			if ct != CodecUnknown {
				if IsVideoCodec(ct) {
					if !containsCodec(codecs.VideoCodecs, ct) {
						codecs.VideoCodecs = append(codecs.VideoCodecs, ct)
					}
				} else if IsAudioCodec(ct) {
					if !containsCodec(codecs.AudioCodecs, ct) {
						codecs.AudioCodecs = append(codecs.AudioCodecs, ct)
					}
				} else if IsSubtitleCodec(ct) {
					if !containsCodec(codecs.SubtitleCodecs, ct) {
						codecs.SubtitleCodecs = append(codecs.SubtitleCodecs, ct)
					}
				}
			}
			// Advance past this entry; stop on overflow or truncated data.
			next := j + 5 + esInfoLen
			if next < j || next > streamsEnd {
				break
			}
			j = next
		}
	}
	return codecs, nil
}
// tsStreamTypeToCodecType maps MPEG-TS stream type values to CodecType.
// Unrecognized stream types map to CodecUnknown.
func tsStreamTypeToCodecType(streamType byte) CodecType {
	switch streamType {
	// Video stream types
	case 0x01:
		return CodecMPEG1Video
	case 0x02:
		return CodecMPEG2Video
	case 0x1B:
		return CodecH264Video
	case 0x24:
		return CodecH265Video
	case 0xEA:
		return CodecVC1Video
	// Audio stream types
	case 0x03, 0x04:
		return CodecMPEGAudio
	case 0x0F:
		return CodecAACaudio
	case 0x80:
		return CodecLPCMAudio
	case 0x81:
		return CodecAC3Audio
	case 0x82:
		return CodecDTSAudio
	case 0x83:
		return CodecTrueHDAudio
	case 0x84:
		return CodecEAC3Audio
	case 0x85, 0x86:
		return CodecDTSHDAudio
	// Subtitle stream types
	case 0x90:
		return CodecPGSSubtitle
	}
	return CodecUnknown
}
// detectTSPacketSize determines TS packet size (188 or 192) and the offset to
// the first sync byte. Returns (0, 0) if no valid TS structure is found.
func detectTSPacketSize(data []byte) (int, int) {
	// Try M2TS framing (192-byte packets with a 4-byte timestamp prefix)
	// before standard TS (188-byte packets).
	for _, pktSize := range []int{192, 188} {
		prefix := 0
		if pktSize == 192 {
			prefix = 4 // M2TS timestamp precedes the sync byte
		}
		for start := 0; start < pktSize && start+pktSize*3 < len(data); start++ {
			// Require the 0x47 sync byte at this packet and the next three.
			aligned := true
			for k := 0; k <= 3; k++ {
				pos := start + k*pktSize + prefix
				if pos >= len(data) || data[pos] != 0x47 {
					aligned = false
					break
				}
			}
			if aligned {
				return pktSize, start
			}
		}
	}
	return 0, 0
}
package source
import "log"
// splitDTSHDCoreStreams detects DTS-HD audio streams that contain an embedded
// DTS core and extracts the core into a separate sub-stream. On Blu-ray,
// DTS-HD streams (PMT types 0x85/0x86) embed DTS core frames followed by
// extension data (ExSS: XBR, XLL, XXCh) in the same PID. Video extraction tools may
// extract either the full DTS-HD stream (A_DTS/LOSSLESS) or just the DTS core
// (A_DTS).
//
// Unlike TrueHD+AC3 where the original is replaced, here we keep the original
// combined sub-stream (for A_DTS/LOSSLESS matching) and add a new core-only
// sub-stream (for A_DTS matching).
func (p *MPEGTSParser) splitDTSHDCoreStreams() {
	for _, subID := range p.audioSubStreams {
		if p.subStreamCodec[subID] != CodecDTSHDAudio {
			continue
		}
		combined := p.audioBySubStream[subID]
		if len(combined) == 0 {
			continue
		}
		// Only split streams that really carry both core and ExSS frames.
		if !p.detectCombinedDTSHDCore(combined) {
			continue
		}
		coreRanges := p.splitDTSHDCoreRanges(combined)
		if len(coreRanges) == 0 {
			continue
		}
		coreRanges = mergeAdjacentRanges(coreRanges)
		// Register the core as an additional sub-stream for A_DTS matching;
		// the combined stream stays for A_DTS/LOSSLESS matching.
		coreID := byte(len(p.audioSubStreams))
		p.audioBySubStream[coreID] = coreRanges
		p.subStreamCodec[coreID] = CodecDTSAudio
		p.audioSubStreams = append(p.audioSubStreams, coreID)
	}
}
// detectCombinedDTSHDCore checks if a DTS-HD audio stream contains both
// DTS core frames and DTS-HD extension (ExSS) frames by scanning the first
// few KB of ES data for both sync patterns.
func (p *MPEGTSParser) detectCombinedDTSHDCore(ranges []PESPayloadRange) bool {
	const maxCheck = 16 * 1024
	var foundCore, foundExSS bool
	checked := 0
	for _, r := range ranges {
		if checked >= maxCheck {
			break
		}
		end := r.FileOffset + int64(r.Size)
		if end > p.size {
			continue
		}
		chunk := p.dataSlice(r.FileOffset, end)
		if budget := maxCheck - checked; budget < len(chunk) {
			chunk = chunk[:budget]
		}
		for i := 0; i+3 < len(chunk); i++ {
			b0, b1, b2, b3 := chunk[i], chunk[i+1], chunk[i+2], chunk[i+3]
			// DTS core sync: 7F FE 80 01
			if b0 == 0x7F && b1 == 0xFE && b2 == 0x80 && b3 == 0x01 {
				foundCore = true
			}
			// DTS-HD ExSS sync: 64 58 20 25
			if b0 == 0x64 && b1 == 0x58 && b2 == 0x20 && b3 == 0x25 {
				foundExSS = true
			}
			if foundCore && foundExSS {
				return true
			}
		}
		checked += len(chunk)
	}
	return false
}
// detectActualDTSCoreSize reads the beginning of a DTS-HD stream's ES data
// to determine the actual core frame size. In DTS-HD MA/HRA streams, the FSIZE
// field in the DTS core header reports the full access unit size (core + extension),
// not just the core portion. This function finds the real core boundary by
// scanning for the ExSS sync word (64 58 20 25) or the next DTS core sync word.
//
// Returns the actual core frame size in bytes, or 0 if it cannot be determined.
func (p *MPEGTSParser) detectActualDTSCoreSize(ranges []PESPayloadRange) int {
	// Read up to 32KB of ES data — enough for several frames at any bitrate.
	const maxRead = 32 * 1024
	buf := make([]byte, 0, maxRead)
	for _, r := range ranges {
		if len(buf) >= maxRead {
			break
		}
		endOffset := r.FileOffset + int64(r.Size)
		// Skip ranges that extend past the backing data.
		if endOffset > p.size {
			continue
		}
		data := p.dataSlice(r.FileOffset, endOffset)
		remaining := maxRead - len(buf)
		if len(data) > remaining {
			data = data[:remaining]
		}
		buf = append(buf, data...)
	}
	// Find all DTS core sync positions to measure frame boundaries.
	// Only positions whose header yields a valid frame size are kept,
	// filtering out sync-word look-alikes inside extension data.
	var syncPositions []int
	for i := 0; i+6 < len(buf); i++ {
		if buf[i] == 0x7F && buf[i+1] == 0xFE &&
			buf[i+2] == 0x80 && buf[i+3] == 0x01 {
			if DTSCoreFrameSize(buf[i:i+7]) > 0 {
				syncPositions = append(syncPositions, i)
			}
		}
	}
	if len(syncPositions) == 0 {
		return 0
	}
	dtsSyncPos := syncPositions[0]
	// Find actual core boundary from the first frame by scanning for ExSS
	// sync or next DTS sync.
	coreSize := 0
	for i := dtsSyncPos + 7; i+3 < len(buf); i++ {
		// ExSS sync: 64 58 20 25
		if buf[i] == 0x64 && buf[i+1] == 0x58 &&
			buf[i+2] == 0x20 && buf[i+3] == 0x25 {
			coreSize = i - dtsSyncPos
			break
		}
		// Next DTS core sync: 7F FE 80 01 (validated)
		if buf[i] == 0x7F && buf[i+1] == 0xFE &&
			buf[i+2] == 0x80 && buf[i+3] == 0x01 {
			if i+6 < len(buf) && DTSCoreFrameSize(buf[i:i+7]) > 0 {
				coreSize = i - dtsSyncPos
				break
			}
		}
	}
	if coreSize == 0 {
		// Could not find boundary — fall back to FSIZE from header.
		return DTSCoreFrameSize(buf[dtsSyncPos : dtsSyncPos+7])
	}
	// Validate that the detected core size is consistent across additional
	// frames. DTS core on Blu-ray uses CBR, so the core portion of each
	// access unit should be the same size. The DTS-HD extension data can
	// vary in size (making total access units differ), so we validate the
	// core boundary directly: at syncPos + coreSize we expect either an
	// ExSS sync word (64 58 20 25) or the next DTS core sync word.
	// The first frame's boundary is already validated (it produced coreSize),
	// so if the buffer is too short to check any additional frames we still
	// trust the measurement.
	for _, sp := range syncPositions[1:] {
		boundary := sp + coreSize
		if boundary+3 >= len(buf) {
			break
		}
		// ExSS sync at expected boundary — core size is correct (4 bytes needed)
		if buf[boundary] == 0x64 && buf[boundary+1] == 0x58 &&
			buf[boundary+2] == 0x20 && buf[boundary+3] == 0x25 {
			continue
		}
		// Next DTS core sync at boundary — no extension in this frame,
		// but core size still matches. Validate the header to avoid false
		// positives from extension data containing the sync word pattern.
		// Requires 7 bytes for DTSCoreFrameSize validation.
		if boundary+6 < len(buf) &&
			buf[boundary] == 0x7F && buf[boundary+1] == 0xFE &&
			buf[boundary+2] == 0x80 && buf[boundary+3] == 0x01 &&
			DTSCoreFrameSize(buf[boundary:boundary+7]) > 0 {
			continue
		}
		// Neither marker at expected boundary — core size may be wrong.
		// If there aren't enough bytes for DTS header validation, don't
		// treat it as a mismatch — just stop checking.
		if boundary+6 >= len(buf) {
			break
		}
		log.Printf("mpegts: warning: DTS core boundary mismatch at offset %d (expected ExSS or DTS sync at +%d); skipping core extraction", sp, coreSize)
		return 0
	}
	return coreSize
}
// splitDTSHDCoreRanges extracts DTS core frame ranges from a combined DTS-HD
// stream. It walks through PES payload ranges, identifies DTS core frames by
// their sync word, and collects only the core bytes (excluding DTS-HD extension
// data).
//
// In DTS-HD streams, the FSIZE header field reports the full access unit size
// (core + extension), not the core-only size. We detect the actual core size
// by scanning for the ExSS boundary in detectActualDTSCoreSize.
//
// Because a 7-byte core header can straddle a PES range boundary, the function
// buffers partial headers in headerBuf and tracks the covered input ranges in
// headerPendingRanges until enough bytes arrive to validate or reject the header.
func (p *MPEGTSParser) splitDTSHDCoreRanges(ranges []PESPayloadRange) []PESPayloadRange {
	// Detect actual core frame size by scanning the stream.
	actualCoreSize := p.detectActualDTSCoreSize(ranges)
	if actualCoreSize <= 0 {
		return nil
	}
	var coreRanges []PESPayloadRange
	var coreES int64   // cumulative ES offset for core output
	coreRemaining := 0 // bytes remaining in current DTS core frame
	// Buffer for DTS core header detection across range boundaries.
	// We need bytes 0-6: 4-byte sync word + 3 bytes for frame size field.
	var headerBuf [7]byte
	headerBufLen := 0
	var headerPendingRanges []PESPayloadRange
	for _, r := range ranges {
		endOffset := r.FileOffset + int64(r.Size)
		if endOffset > p.size {
			// Range extends past the file — skip it (malformed input).
			continue
		}
		data := p.dataSlice(r.FileOffset, endOffset)
		pos := 0
		// Handle header bytes buffered from previous range
		if headerBufLen > 0 && coreRemaining == 0 {
			need := 7 - headerBufLen
			if need > len(data) {
				// This range doesn't have enough bytes to complete the
				// 7-byte header. Buffer these bytes and continue accumulating
				// across subsequent ranges until we have a full 7-byte header.
				copy(headerBuf[headerBufLen:], data)
				headerBufLen += len(data)
				headerPendingRanges = append(headerPendingRanges, r)
				// Move to the next range; do not rescan these bytes individually.
				continue
			} else {
				copy(headerBuf[headerBufLen:], data[:need])
				if DTSCoreFrameSize(headerBuf[:7]) > 0 {
					// Valid DTS core frame spanning range boundary.
					// Add any intermediate pending ranges to core.
					for _, pr := range headerPendingRanges {
						coreRanges = append(coreRanges, PESPayloadRange{
							FileOffset: pr.FileOffset,
							Size:       pr.Size,
							ESOffset:   coreES,
						})
						coreES += int64(pr.Size)
					}
					headerPendingRanges = nil
					coreRanges = append(coreRanges, PESPayloadRange{
						FileOffset: r.FileOffset,
						Size:       need,
						ESOffset:   coreES,
					})
					coreES += int64(need)
					// Use detected core size, not FSIZE. Subtract the 7 header
					// bytes already consumed (from buffer + current range).
					coreRemaining = actualCoreSize - 7
					pos = need
					headerBufLen = 0
					// Jump straight into the consumption loop; the header has
					// already been accounted for above.
					goto scanLoop
				}
				// Not a valid DTS core header — discard buffered bytes (they're extension data).
				// Re-attribute the optimistic core range back (remove it).
				// The optimistic range is the trailing-tail entry appended at
				// the end of the previous range's processing.
				if len(coreRanges) > 0 {
					last := coreRanges[len(coreRanges)-1]
					coreRanges = coreRanges[:len(coreRanges)-1]
					coreES -= int64(last.Size)
				}
				headerPendingRanges = nil
				headerBufLen = 0
			}
		}
	scanLoop:
		for pos < len(data) {
			if coreRemaining > 0 {
				// Inside a DTS core frame — consume bytes
				consume := coreRemaining
				if consume > len(data)-pos {
					consume = len(data) - pos
				}
				coreRanges = append(coreRanges, PESPayloadRange{
					FileOffset: r.FileOffset + int64(pos),
					Size:       consume,
					ESOffset:   coreES,
				})
				coreES += int64(consume)
				coreRemaining -= consume
				pos += consume
				continue
			}
			// Look for DTS core sync word (need 7 bytes: 4-byte sync + 3 for frame size)
			if pos+6 < len(data) &&
				data[pos] == 0x7F && data[pos+1] == 0xFE &&
				data[pos+2] == 0x80 && data[pos+3] == 0x01 {
				if DTSCoreFrameSize(data[pos:pos+7]) > 0 {
					coreRemaining = actualCoreSize
					continue // will be consumed in coreRemaining branch
				}
			}
			// Not DTS core data (extension or other) — skip forward to next
			// potential DTS core sync word or end of range
			pos++
			for pos < len(data) {
				if pos+6 < len(data) &&
					data[pos] == 0x7F && data[pos+1] == 0xFE &&
					data[pos+2] == 0x80 && data[pos+3] == 0x01 {
					if DTSCoreFrameSize(data[pos:pos+7]) > 0 {
						break
					}
				}
				pos++
			}
		}
		// After processing, check if trailing bytes could be a partial DTS core header
		if coreRemaining == 0 && len(data) > 0 {
			// Look for 0x7F (start of DTS sync word) near end of range.
			// We need up to 7 bytes (4-byte sync + 3 bytes) for DTSCoreFrameSize(),
			// so search the last 6 bytes in case the sync word starts at len(data)-6
			// or len(data)-5 and continues into the next range.
			checkStart := len(data) - 6
			if checkStart < 0 {
				checkStart = 0
			}
			bufStart := -1
			for j := len(data) - 1; j >= checkStart; j-- {
				if data[j] == 0x7F {
					bufStart = j
					break
				}
			}
			if bufStart >= 0 {
				tailLen := len(data) - bufStart
				copy(headerBuf[:], data[bufStart:])
				headerBufLen = tailLen
				// Add trimmed bytes to core optimistically; if the next range
				// proves this is not a header, the entry is removed again above.
				coreRanges = append(coreRanges, PESPayloadRange{
					FileOffset: r.FileOffset + int64(bufStart),
					Size:       tailLen,
					ESOffset:   coreES,
				})
				coreES += int64(tailLen)
			}
		}
	}
	// If we ended with buffered bytes, they weren't a valid DTS core header — remove
	if headerBufLen > 0 {
		if len(coreRanges) > 0 {
			last := coreRanges[len(coreRanges)-1]
			coreRanges = coreRanges[:len(coreRanges)-1]
			coreES -= int64(last.Size)
		}
	}
	return coreRanges
}
package source
import (
"bytes"
"fmt"
"log"
)
// Parse scans the file and extracts all PES payload ranges.
// It is shorthand for ParseWithProgress with no progress callback.
func (p *MPEGTSParser) Parse() error {
	var noProgress MPEGTSProgressFunc
	return p.ParseWithProgress(noProgress)
}
// ParseWithProgress scans the M2TS file with progress reporting.
func (p *MPEGTSParser) ParseWithProgress(progress MPEGTSProgressFunc) error {
	// Multi-extent sources take the region-aware path.
	if p.multiRegion != nil {
		return p.parseMultiRegion(progress)
	}
	// Step 1: determine the TS packet size (188 plain TS / 192 M2TS)
	// from a small prefix of the file.
	probe := min(192*16, len(p.data))
	pktSize, firstPkt := detectTSPacketSize(p.data[:probe])
	if pktSize == 0 {
		return fmt.Errorf("cannot detect TS packet size")
	}
	p.packetSize = pktSize
	if pktSize == 192 {
		// 192-byte M2TS packets carry a 4-byte prefix before the TS sync byte.
		p.tsOffset = 4
	}
	// Step 2: locate the PAT/PMT within the first 2 MiB to learn stream PIDs.
	psiLen := min(2*1024*1024, len(p.data))
	if err := p.parsePATandPMT(p.data[:psiLen], firstPkt); err != nil {
		return fmt.Errorf("parse PAT/PMT: %w", err)
	}
	// Step 3: walk every TS packet and record PES payload ranges.
	st := p.initScanState()
	p.scanPackets(p.data, firstPkt, 0, st, progress)
	if progress != nil {
		progress(p.size, p.size)
	}
	return p.finalizeParse()
}
// parseMultiRegion handles parsing when data comes from multiple non-contiguous
// mmap regions. Processes each region sequentially, handling TS packets that
// straddle region boundaries via a small carryover buffer.
//
// Packet size detection and PAT/PMT parsing only look at the first region;
// this assumes the PSI tables live near the start of the stream (same
// assumption the contiguous path makes with its 2 MiB scan window).
func (p *MPEGTSParser) parseMultiRegion(progress MPEGTSProgressFunc) error {
	mr := p.multiRegion
	if len(mr.regions) == 0 {
		return fmt.Errorf("no regions in multi-region data")
	}
	// Step 1: Detect TS packet size from first region
	firstRegion := mr.regions[0].data
	detectLen := 192 * 16
	if detectLen > len(firstRegion) {
		detectLen = len(firstRegion)
	}
	packetSize, startOffset := detectTSPacketSize(firstRegion[:detectLen])
	if packetSize == 0 {
		return fmt.Errorf("cannot detect TS packet size")
	}
	p.packetSize = packetSize
	if packetSize == 192 {
		// M2TS: 4-byte prefix before the TS sync byte in each packet.
		p.tsOffset = 4
	}
	// Step 2: Parse PAT/PMT from first region
	scanLen := 2 * 1024 * 1024
	if scanLen > len(firstRegion) {
		scanLen = len(firstRegion)
	}
	if err := p.parsePATandPMT(firstRegion[:scanLen], startOffset); err != nil {
		return fmt.Errorf("parse PAT/PMT: %w", err)
	}
	// Step 3: Scan packets across all regions
	ss := p.initScanState()
	var carryover []byte // partial TS packet left over at a region boundary
	for i, reg := range mr.regions {
		chunk := reg.data
		logicalBase := reg.logicalStart
		chunkStart := 0
		if i == 0 {
			// First region: skip to the initial start offset
			chunkStart = startOffset
		}
		// Handle carryover from previous region boundary
		if len(carryover) > 0 {
			needed := p.packetSize - len(carryover)
			if needed <= len(chunk) {
				// Assemble the straddling packet and process it.
				// bridgeBase backdates the logical offset so range FileOffsets
				// computed inside scanPackets land on the packet's true start.
				bridgePkt := make([]byte, p.packetSize)
				copy(bridgePkt, carryover)
				copy(bridgePkt[len(carryover):], chunk[:needed])
				bridgeBase := logicalBase - int64(len(carryover))
				p.scanPackets(bridgePkt, 0, bridgeBase, ss, nil)
				chunkStart = needed
				carryover = nil
			} else {
				// Region too small to complete the packet — accumulate and continue
				carryover = append(carryover, chunk...)
				continue
			}
		}
		// Process complete packets in this region
		available := len(chunk) - chunkStart
		nComplete := (available / p.packetSize) * p.packetSize
		if nComplete > 0 {
			p.scanPackets(chunk[chunkStart:chunkStart+nComplete], 0, logicalBase+int64(chunkStart), ss, progress)
		}
		// Save any remainder for the next region
		remainder := available - nComplete
		if remainder > 0 {
			carryover = make([]byte, remainder)
			copy(carryover, chunk[chunkStart+nComplete:])
		}
	}
	if len(carryover) > 0 {
		log.Printf("mpegts: warning: discarding %d carryover bytes at end of multi-region data (incomplete TS packet)", len(carryover))
	}
	if progress != nil {
		progress(p.size, p.size)
	}
	return p.finalizeParse()
}
// pesState tracks PES header parsing state across TS packets.
type pesState struct {
	// headerBytesRemaining is the count of PES header bytes still to be
	// skipped when a PES header extends into the following TS packet(s).
	// Zero means the next continuation packet is pure ES payload.
	headerBytesRemaining int
}
// scanState holds mutable state for the packet scanning loop.
type scanState struct {
	// trackedPIDs is the set of PIDs (video + audio/subtitle) to record.
	trackedPIDs map[uint16]bool
	// pesStates tracks per-PID PES header continuation state.
	pesStates map[uint16]*pesState
	// videoESOffset is the running elementary-stream offset for video ranges.
	videoESOffset int64
	// audioESOffsets is the running ES offset per audio sub-stream ID.
	audioESOffsets map[byte]int64
	// lastProgress is the logical position at the last progress callback,
	// used to throttle reporting.
	lastProgress int64
}
// initScanState sets up PID tracking and PES state for scanning.
// Returns nil when no stream PIDs were discovered, which makes
// scanPackets a no-op.
func (p *MPEGTSParser) initScanState() *scanState {
	if p.videoPID == 0 && len(p.audioPIDs) == 0 {
		return nil
	}
	tracked := make(map[uint16]bool)
	if p.videoPID != 0 {
		tracked[p.videoPID] = true
	}
	for _, pid := range p.audioPIDs {
		tracked[pid] = true
	}
	// Pre-size the range slices from a rough packet count so the scan loop
	// avoids repeated reallocation on large files.
	pktEstimate := int(p.size) / p.packetSize
	if p.videoPID != 0 {
		p.videoRanges = make([]PESPayloadRange, 0, pktEstimate*7/10)
	}
	for _, pid := range p.audioPIDs {
		sub := p.pidToSubStream[pid]
		p.audioBySubStream[sub] = make([]PESPayloadRange, 0, pktEstimate/10/len(p.audioPIDs))
	}
	states := make(map[uint16]*pesState, len(tracked))
	for pid := range tracked {
		states[pid] = &pesState{}
	}
	return &scanState{
		trackedPIDs:    tracked,
		pesStates:      states,
		audioESOffsets: make(map[byte]int64),
	}
}
// scanPackets processes TS packets in a data buffer, recording PES payload ranges.
// logicalBase is added to all FileOffset values to produce logical (assembled) offsets.
//
// For each tracked PID the function distinguishes packets that begin a new PES
// packet (PUSI set) — where the PES header must be skipped — from continuation
// packets, where any header bytes left over from a previous packet are skipped
// via pesState before the remainder is recorded as ES payload.
func (p *MPEGTSParser) scanPackets(data []byte, startPos int, logicalBase int64, ss *scanState, progress MPEGTSProgressFunc) {
	// A nil scan state means initScanState found no PIDs — nothing to do.
	if ss == nil {
		return
	}
	for pos := startPos; pos+p.packetSize <= len(data); pos += p.packetSize {
		tsStart := pos + p.tsOffset
		// Require the TS sync byte; otherwise skip this packet slot.
		if tsStart >= len(data) || data[tsStart] != 0x47 {
			continue
		}
		// PID is the low 13 bits of bytes 1-2 of the TS header.
		pid := uint16(data[tsStart+1]&0x1F)<<8 | uint16(data[tsStart+2])
		if !ss.trackedPIDs[pid] {
			continue
		}
		// PUSI (payload_unit_start_indicator): a new PES packet begins here.
		pusi := data[tsStart+1]&0x40 != 0
		adaptFieldCtrl := (data[tsStart+3] >> 4) & 0x03
		// Find payload start
		payloadOff := tsStart + 4
		switch adaptFieldCtrl {
		case 0x01: // payload only
		case 0x03: // adaptation field + payload
			if payloadOff < pos+p.packetSize {
				adaptLen := int(data[payloadOff])
				payloadOff += 1 + adaptLen
			}
		default: // 0x02 = adaptation only, 0x00 = reserved
			continue
		}
		payloadEnd := pos + p.packetSize
		if payloadEnd > len(data) {
			payloadEnd = len(data)
		}
		if payloadOff >= payloadEnd {
			// Adaptation field consumed the whole packet.
			continue
		}
		payload := data[payloadOff:payloadEnd]
		state := ss.pesStates[pid]
		// File offset in the logical (assembled) coordinate space
		logPayloadOff := logicalBase + int64(payloadOff)
		if pusi {
			// New PES packet starts here; require the 00 00 01 start code.
			if len(payload) < 9 || payload[0] != 0 || payload[1] != 0 || payload[2] != 1 {
				continue
			}
			// Byte 8 is PES_header_data_length; full header is 9 + that.
			pesHeaderDataLen := int(payload[8])
			pesHeaderSize := 9 + pesHeaderDataLen
			if pesHeaderSize >= len(payload) {
				// Header continues into the next packet — remember how much.
				state.headerBytesRemaining = pesHeaderSize - len(payload)
				continue
			}
			esPayload := payload[pesHeaderSize:]
			fileOffset := logPayloadOff + int64(pesHeaderSize)
			if pid == p.videoPID {
				p.videoRanges = append(p.videoRanges, PESPayloadRange{
					FileOffset: fileOffset,
					Size:       len(esPayload),
					ESOffset:   ss.videoESOffset,
				})
				ss.videoESOffset += int64(len(esPayload))
			} else {
				subID := p.pidToSubStream[pid]
				p.audioBySubStream[subID] = append(p.audioBySubStream[subID], PESPayloadRange{
					FileOffset: fileOffset,
					Size:       len(esPayload),
					ESOffset:   ss.audioESOffsets[subID],
				})
				ss.audioESOffsets[subID] += int64(len(esPayload))
			}
			state.headerBytesRemaining = 0
		} else {
			// Continuation packet
			esPayload := payload
			fileOffset := logPayloadOff
			if state.headerBytesRemaining > 0 {
				// Still skipping PES header bytes from the previous packet.
				if state.headerBytesRemaining >= len(esPayload) {
					state.headerBytesRemaining -= len(esPayload)
					continue
				}
				esPayload = esPayload[state.headerBytesRemaining:]
				fileOffset += int64(state.headerBytesRemaining)
				state.headerBytesRemaining = 0
			}
			if len(esPayload) == 0 {
				continue
			}
			if pid == p.videoPID {
				p.videoRanges = append(p.videoRanges, PESPayloadRange{
					FileOffset: fileOffset,
					Size:       len(esPayload),
					ESOffset:   ss.videoESOffset,
				})
				ss.videoESOffset += int64(len(esPayload))
			} else {
				subID := p.pidToSubStream[pid]
				p.audioBySubStream[subID] = append(p.audioBySubStream[subID], PESPayloadRange{
					FileOffset: fileOffset,
					Size:       len(esPayload),
					ESOffset:   ss.audioESOffsets[subID],
				})
				ss.audioESOffsets[subID] += int64(len(esPayload))
			}
		}
		// Report progress at most every 100 MiB of logical position.
		logPos := logicalBase + int64(pos)
		if progress != nil && logPos-ss.lastProgress > 100*1024*1024 {
			progress(logPos, p.size)
			ss.lastProgress = logPos
		}
	}
}
// finalizeParse performs post-scan processing: video range filtering and
// TrueHD+AC3 stream splitting. Shared by contiguous and multi-region paths.
func (p *MPEGTSParser) finalizeParse() error {
	// A stream with no usable PIDs means the PMT yielded nothing to extract.
	if p.videoPID == 0 && len(p.audioPIDs) == 0 {
		return fmt.Errorf("no video or audio PIDs found in PMT")
	}
	if err := p.buildFilteredVideoRanges(); err != nil {
		return fmt.Errorf("build filtered video ranges: %w", err)
	}
	// NOTE(review): presumably signals downstream consumers to use the
	// filtered (user_data-stripped) video ranges — confirm with callers.
	p.filterUserData = true
	// Split combined audio streams into their separate sub-streams.
	p.splitTrueHDAC3Streams()
	p.splitDTSHDCoreStreams()
	return nil
}
// parsePATandPMT finds the PAT and PMT in the first portion of the file
// and extracts video/audio PIDs and stream types.
//
// Only the first video stream is kept; every recognized audio/subtitle
// stream is assigned a sequential sub-stream ID in PMT order.
func (p *MPEGTSParser) parsePATandPMT(data []byte, startOffset int) error {
	// Find PAT (PID 0) and extract PMT PID
	patSection, err := reassemblePSISection(data, startOffset, p.packetSize, p.tsOffset, 0, 0x00)
	if err != nil {
		return fmt.Errorf("reassemble PAT: %w", err)
	}
	pmtPID := pmtPIDFromPAT(patSection)
	if pmtPID == 0 {
		return fmt.Errorf("PMT PID not found in PAT")
	}
	// Find PMT and extract stream types.
	// PMT sections can span multiple TS packets, so we must reassemble.
	pmtSection, err := reassemblePSISection(data, startOffset, p.packetSize, p.tsOffset, pmtPID, 0x02)
	if err != nil {
		return fmt.Errorf("reassemble PMT: %w", err)
	}
	if len(pmtSection) >= 12 {
		// program_info_length sits at bytes 10-11; the ES stream loop
		// starts right after the program descriptors.
		progInfoLen := int(pmtSection[10]&0x0F)<<8 | int(pmtSection[11])
		streamsStart := 12 + progInfoLen
		sectionLen := int(pmtSection[1]&0x0F)<<8 | int(pmtSection[2])
		streamsEnd := 3 + sectionLen - 4 // exclude CRC32
		if streamsEnd > len(pmtSection) {
			streamsEnd = len(pmtSection)
		}
		var subStreamSeq byte
		// Each stream entry: type (1) + PID (2) + ES_info_length (2) + descriptors.
		for j := streamsStart; j+5 <= streamsEnd; {
			streamType := pmtSection[j]
			esPID := uint16(pmtSection[j+1]&0x1F)<<8 | uint16(pmtSection[j+2])
			esInfoLen := int(pmtSection[j+3]&0x0F)<<8 | int(pmtSection[j+4])
			ct := tsStreamTypeToCodecType(streamType)
			if ct != CodecUnknown {
				if IsVideoCodec(ct) && p.videoPID == 0 {
					p.videoPID = esPID
					p.videoCodec = ct
				} else if IsAudioCodec(ct) || IsSubtitleCodec(ct) {
					p.audioPIDs = append(p.audioPIDs, esPID)
					p.pidToSubStream[esPID] = subStreamSeq
					p.subStreamToPID[subStreamSeq] = esPID
					p.subStreamCodec[subStreamSeq] = ct
					p.audioSubStreams = append(p.audioSubStreams, subStreamSeq)
					subStreamSeq++
				}
			}
			// Advance past the descriptors; bail on overflow or a bogus
			// ES_info_length that would run past the section.
			next := j + 5 + esInfoLen
			if next < j || next > streamsEnd {
				break
			}
			j = next
		}
	}
	return nil
}
// reassemblePSISection collects a complete PSI section (PAT, PMT, etc.) from
// one or more TS packets. packetSize is 188 (standard TS) or 192 (M2TS).
// tsOffset is the offset from packet start to TS sync byte (4 for M2TS, 0 for TS).
//
// A section may start mid-packet (pointer_field) and span several packets;
// the function accumulates payload bytes until section_length bytes have been
// collected, or returns an error if the section never starts or is truncated.
func reassemblePSISection(data []byte, startOffset, packetSize, tsOffset int, targetPID uint16, tableID byte) ([]byte, error) {
	var section []byte
	sectionLen := -1    // total section size (3 header bytes + section_length)
	collecting := false // true once the section header has been seen
	for i := startOffset; i+packetSize <= len(data); i += packetSize {
		tsStart := i + tsOffset
		if tsStart+188 > len(data) || data[tsStart] != 0x47 {
			continue
		}
		pid := uint16(data[tsStart+1]&0x1F)<<8 | uint16(data[tsStart+2])
		if pid != targetPID {
			continue
		}
		pusi := data[tsStart+1]&0x40 != 0
		adaptFieldCtrl := (data[tsStart+3] >> 4) & 0x03
		hdrLen := 4
		switch adaptFieldCtrl {
		case 0x02: // Adaptation field only, no payload
			continue
		case 0x03: // Adaptation field + payload
			if tsStart+4 >= len(data) {
				continue
			}
			hdrLen = 5 + int(data[tsStart+4])
		case 0x01: // Payload only
		default:
			continue
		}
		if tsStart+hdrLen >= tsStart+188 {
			// Adaptation field consumed the whole packet.
			continue
		}
		payload := data[tsStart+hdrLen : tsStart+188]
		if pusi {
			// PUSI packet: pointer_field indicates how many bytes at the start
			// of the payload belong to the tail of a previous section.
			pointerField := int(payload[0])
			sectionStart := 1 + pointerField
			if sectionStart > len(payload) {
				continue
			}
			// If we're mid-collection, the bytes before sectionStart are the
			// tail of the section we're assembling.
			if collecting && pointerField > 0 {
				tail := payload[1:sectionStart]
				remaining := sectionLen - len(section)
				if len(tail) > remaining {
					tail = tail[:remaining]
				}
				section = append(section, tail...)
				if len(section) >= sectionLen {
					return section, nil
				}
			}
			payload = payload[sectionStart:]
			// A new section must start here with the requested table_id.
			if len(payload) < 3 || payload[0] != tableID {
				continue
			}
			// Total size = 3 header bytes + section_length (12-bit field).
			sectionLen = 3 + (int(payload[1]&0x0F)<<8 | int(payload[2]))
			section = make([]byte, 0, sectionLen)
			collecting = true
			// Append what we have from this packet
			n := len(payload)
			if n > sectionLen {
				n = sectionLen
			}
			section = append(section, payload[:n]...)
		} else if collecting {
			// Continuation packet
			remaining := sectionLen - len(section)
			n := len(payload)
			if n > remaining {
				n = remaining
			}
			section = append(section, payload[:n]...)
		}
		if collecting && len(section) >= sectionLen {
			return section, nil
		}
	}
	if collecting {
		return nil, fmt.Errorf("truncated PSI section for table ID 0x%02X on PID 0x%04X: got %d of %d bytes", tableID, targetPID, len(section), sectionLen)
	}
	return nil, fmt.Errorf("PSI section with table ID 0x%02X not found on PID 0x%04X", tableID, targetPID)
}
// pmtPIDFromPAT extracts the PMT PID from a reassembled PAT section.
// Returns the PID of the first non-zero program, or 0 if none found.
func pmtPIDFromPAT(patSection []byte) uint16 {
	// Need at least the 8-byte PAT header before any program entries.
	if len(patSection) < 8 {
		return 0
	}
	sectionLen := int(patSection[1]&0x0F)<<8 | int(patSection[2])
	// section_length counts from byte 3; the final 4 bytes are the CRC32.
	end := 3 + sectionLen - 4
	if end > len(patSection) {
		end = len(patSection)
	}
	// Program loop: 4 bytes per entry (program_number, PID), starting at byte 8.
	for off := 8; off+4 <= end; off += 4 {
		num := uint16(patSection[off])<<8 | uint16(patSection[off+1])
		if num != 0 {
			return uint16(patSection[off+2]&0x1F)<<8 | uint16(patSection[off+3])
		}
		// program_number 0 designates the network PID — keep looking.
	}
	return 0
}
// buildFilteredVideoRanges creates filtered video ranges.
// For MPEG-2 video, this excludes user_data (00 00 01 B2) sections.
// For H.264/H.265, filtered ranges are the same as raw ranges (no filtering needed).
func (p *MPEGTSParser) buildFilteredVideoRanges() error {
	if len(p.videoRanges) == 0 {
		return nil
	}
	// Only MPEG-2 needs user_data filtering
	if p.videoCodec != CodecMPEG2Video {
		// For H.264/H.265/etc, no filtering needed — use raw ranges directly
		p.filteredVideoRanges = p.videoRanges
		return nil
	}
	// MPEG-2: scan for user_data sections and exclude them
	// Same algorithm as MPEGPSParser.buildFilteredVideoRanges
	filteredRanges := make([]PESPayloadRange, 0, len(p.videoRanges))
	var filteredESOffset int64
	for _, rawRange := range p.videoRanges {
		endOffset := rawRange.FileOffset + int64(rawRange.Size)
		if endOffset > p.size {
			continue
		}
		data := p.dataSlice(rawRange.FileOffset, endOffset)
		// Search by hunting the 0x01 byte of each 00 00 01 start code.
		i := 2
		rangeStart := 0 // start of the current kept (non-user_data) span
		for i < len(data)-1 {
			idx := bytes.IndexByte(data[i:], 0x01)
			if idx < 0 {
				break
			}
			pos := i + idx
			// A user_data start code is 00 00 01 followed by UserDataStartCode.
			if pos >= 2 && pos < len(data)-1 &&
				data[pos-1] == 0x00 && data[pos-2] == 0x00 && data[pos+1] == UserDataStartCode {
				startCodePos := pos - 2
				// Emit the kept bytes preceding the user_data section.
				if startCodePos > rangeStart {
					filteredRanges = append(filteredRanges, PESPayloadRange{
						FileOffset: rawRange.FileOffset + int64(rangeStart),
						Size:       startCodePos - rangeStart,
						ESOffset:   filteredESOffset,
					})
					filteredESOffset += int64(startCodePos - rangeStart)
				}
				// Skip forward to the next start code, which terminates the
				// user_data section.
				i = pos + 2
				for i < len(data)-1 {
					idx := bytes.IndexByte(data[i:], 0x01)
					if idx < 0 {
						i = len(data)
						break
					}
					nextPos := i + idx
					if nextPos >= 2 && data[nextPos-1] == 0x00 && data[nextPos-2] == 0x00 {
						i = nextPos - 2
						break
					}
					i = nextPos + 1
				}
				rangeStart = i
			} else {
				i = pos + 1
			}
		}
		// Emit any kept tail after the last user_data section.
		if rangeStart < len(data) {
			filteredRanges = append(filteredRanges, PESPayloadRange{
				FileOffset: rawRange.FileOffset + int64(rangeStart),
				Size:       len(data) - rangeStart,
				ESOffset:   filteredESOffset,
			})
			filteredESOffset += int64(len(data) - rangeStart)
		}
	}
	p.filteredVideoRanges = filteredRanges
	return nil
}
package source
// splitTrueHDAC3Streams detects combined TrueHD+AC3 audio streams and splits
// them into separate sub-streams. On Blu-ray, TrueHD streams (PMT type 0x83)
// interleave an AC3 compatibility core in the same PID. Video extraction tools split
// these into separate MKV tracks, so we must split them here to match.
func (p *MPEGTSParser) splitTrueHDAC3Streams() {
	// Snapshot the current sub-stream list; sub-streams appended during the
	// loop are AC3-only and never need splitting themselves.
	existing := p.audioSubStreams
	for _, subID := range existing {
		if p.subStreamCodec[subID] != CodecTrueHDAudio {
			continue
		}
		combined := p.audioBySubStream[subID]
		if len(combined) == 0 {
			continue
		}
		// Only split streams that really carry interleaved AC3.
		if !p.detectCombinedTrueHDAC3(combined) {
			continue
		}
		ac3, truehd := p.splitCombinedAudioRanges(combined)
		if len(ac3) == 0 {
			continue
		}
		// Coalesce contiguous entries to keep the range lists small.
		ac3 = mergeAdjacentRanges(ac3)
		truehd = mergeAdjacentRanges(truehd)
		// The original sub-stream keeps only the TrueHD portion; the AC3
		// core becomes a brand-new sub-stream with the next free ID.
		p.audioBySubStream[subID] = truehd
		freshID := byte(len(p.audioSubStreams))
		p.audioBySubStream[freshID] = ac3
		p.subStreamCodec[freshID] = CodecAC3Audio
		p.audioSubStreams = append(p.audioSubStreams, freshID)
	}
}
// detectCombinedTrueHDAC3 checks if a TrueHD audio stream contains interleaved
// AC3 frames by scanning the first few KB of ES data for both sync patterns.
func (p *MPEGTSParser) detectCombinedTrueHDAC3(ranges []PESPayloadRange) bool {
	const budget = 16 * 1024 // inspect at most the first 16KB of ES data
	foundAC3 := false
	foundTrueHD := false
	scanned := 0
	for _, r := range ranges {
		if scanned >= budget {
			break
		}
		end := r.FileOffset + int64(r.Size)
		if end > p.size {
			continue
		}
		chunk := p.dataSlice(r.FileOffset, end)
		// Trim the chunk to whatever budget remains.
		if left := budget - scanned; left < len(chunk) {
			chunk = chunk[:left]
		}
		for i := 0; i < len(chunk)-1; i++ {
			// AC3 sync word: 0B 77.
			if chunk[i] == 0x0B && chunk[i+1] == 0x77 {
				foundAC3 = true
			}
			// TrueHD major sync: F8 72 6F BA.
			if i+3 < len(chunk) &&
				chunk[i] == 0xF8 && chunk[i+1] == 0x72 &&
				chunk[i+2] == 0x6F && chunk[i+3] == 0xBA {
				foundTrueHD = true
			}
			if foundAC3 && foundTrueHD {
				return true
			}
		}
		scanned += len(chunk)
	}
	return false
}
// splitCombinedAudioRanges splits PES payload ranges of a combined TrueHD+AC3
// stream into separate AC3 and TrueHD ranges using AU-aware parsing.
//
// The interleaved stream alternates between AC3 frames and TrueHD access units
// at unit boundaries. At each boundary, the parser checks for the AC3 sync word
// (0B 77) to identify AC3 frames, or reads the TrueHD AU length header to
// determine the AU size. This avoids false-positive AC3 detection inside TrueHD
// AU data, which the previous byte-scan approach was susceptible to.
//
// Unit headers may straddle PES range boundaries, so up to 5 bytes are
// buffered in headerBuf (with their source ranges tracked in pendingRanges)
// until enough bytes are available to classify the unit.
func (p *MPEGTSParser) splitCombinedAudioRanges(ranges []PESPayloadRange) (ac3Ranges, truehdRanges []PESPayloadRange) {
	var ac3ES, truehdES int64 // cumulative ES offsets for each output stream
	ac3Remaining := 0         // bytes remaining in current AC3 frame
	truehdRemaining := 0      // bytes remaining in current TrueHD AU
	// Cross-boundary header buffer. At unit boundaries, we need 2 bytes
	// to determine type (AC3 vs TrueHD), or 5 bytes if starting with
	// AC3 sync 0B 77 (to read fscod+frmsizecod at byte 4).
	var headerBuf [5]byte
	headerBufLen := 0
	type pendingRange struct {
		fileOffset int64
		size       int
	}
	var pendingRanges []pendingRange
	for _, r := range ranges {
		endOffset := r.FileOffset + int64(r.Size)
		if endOffset > p.size {
			continue
		}
		data := p.dataSlice(r.FileOffset, endOffset)
		pos := 0
		// Resolve buffered header bytes from previous range
		if headerBufLen > 0 && ac3Remaining == 0 && truehdRemaining == 0 {
			// Determine how many total bytes we need
			needTotal := 2
			if headerBufLen >= 2 && headerBuf[0] == 0x0B && headerBuf[1] == 0x77 {
				needTotal = 5
			}
			need := needTotal - headerBufLen
			available := len(data) - pos
			if need > available {
				// Still not enough data — buffer more
				copy(headerBuf[headerBufLen:], data[pos:])
				headerBufLen += available
				pendingRanges = append(pendingRanges, pendingRange{r.FileOffset + int64(pos), available})
				continue
			}
			copy(headerBuf[headerBufLen:], data[pos:pos+need])
			consumedFromCurrent := need
			headerBufLen += need
			// Re-check: we may now have 0B 77 and need more bytes
			if headerBufLen >= 2 && headerBuf[0] == 0x0B && headerBuf[1] == 0x77 && headerBufLen < 5 {
				moreNeed := 5 - headerBufLen
				moreAvail := len(data) - pos - consumedFromCurrent
				if moreNeed > moreAvail {
					// Still not enough for full AC3 header
					copy(headerBuf[headerBufLen:], data[pos+consumedFromCurrent:])
					pendingRanges = append(pendingRanges, pendingRange{r.FileOffset + int64(pos), consumedFromCurrent + moreAvail})
					headerBufLen += moreAvail
					continue
				}
				copy(headerBuf[headerBufLen:], data[pos+consumedFromCurrent:pos+consumedFromCurrent+moreNeed])
				consumedFromCurrent += moreNeed
				headerBufLen += moreNeed
			}
			// Classify the unit: AC3 if the sync + frame-size code validate,
			// otherwise try the TrueHD AU length header.
			isAC3 := false
			unitSize := 0
			if headerBuf[0] == 0x0B && headerBuf[1] == 0x77 && headerBufLen >= 5 {
				fscod := (headerBuf[4] >> 6) & 0x03
				frmsizecod := headerBuf[4] & 0x3F
				frameSize := AC3FrameSize(fscod, frmsizecod)
				if frameSize > 0 {
					isAC3 = true
					unitSize = frameSize
				}
			}
			if !isAC3 {
				auLen := ParseTrueHDAULength(headerBuf[:2])
				if auLen >= 4 {
					unitSize = auLen
				}
			}
			if unitSize > 0 {
				// Attribute pending ranges + consumed bytes from current range
				if isAC3 {
					for _, pr := range pendingRanges {
						ac3Ranges = append(ac3Ranges, PESPayloadRange{
							FileOffset: pr.fileOffset,
							Size:       pr.size,
							ESOffset:   ac3ES,
						})
						ac3ES += int64(pr.size)
					}
					if consumedFromCurrent > 0 {
						ac3Ranges = append(ac3Ranges, PESPayloadRange{
							FileOffset: r.FileOffset + int64(pos),
							Size:       consumedFromCurrent,
							ESOffset:   ac3ES,
						})
						ac3ES += int64(consumedFromCurrent)
					}
					// headerBufLen bytes of this unit are already attributed.
					ac3Remaining = unitSize - headerBufLen
				} else {
					for _, pr := range pendingRanges {
						truehdRanges = append(truehdRanges, PESPayloadRange{
							FileOffset: pr.fileOffset,
							Size:       pr.size,
							ESOffset:   truehdES,
						})
						truehdES += int64(pr.size)
					}
					if consumedFromCurrent > 0 {
						truehdRanges = append(truehdRanges, PESPayloadRange{
							FileOffset: r.FileOffset + int64(pos),
							Size:       consumedFromCurrent,
							ESOffset:   truehdES,
						})
						truehdES += int64(consumedFromCurrent)
					}
					truehdRemaining = unitSize - headerBufLen
				}
			} else {
				// Unrecognized — attribute all buffered bytes to TrueHD
				for _, pr := range pendingRanges {
					truehdRanges = append(truehdRanges, PESPayloadRange{
						FileOffset: pr.fileOffset,
						Size:       pr.size,
						ESOffset:   truehdES,
					})
					truehdES += int64(pr.size)
				}
				if consumedFromCurrent > 0 {
					truehdRanges = append(truehdRanges, PESPayloadRange{
						FileOffset: r.FileOffset + int64(pos),
						Size:       consumedFromCurrent,
						ESOffset:   truehdES,
					})
					truehdES += int64(consumedFromCurrent)
				}
			}
			pos += consumedFromCurrent
			headerBufLen = 0
			pendingRanges = nil
		}
		for pos < len(data) {
			if ac3Remaining > 0 {
				// Inside an AC3 frame — consume up to the end of this range.
				consume := min(ac3Remaining, len(data)-pos)
				ac3Ranges = append(ac3Ranges, PESPayloadRange{
					FileOffset: r.FileOffset + int64(pos),
					Size:       consume,
					ESOffset:   ac3ES,
				})
				ac3ES += int64(consume)
				ac3Remaining -= consume
				pos += consume
				continue
			}
			if truehdRemaining > 0 {
				// Inside a TrueHD AU — consume up to the end of this range.
				consume := min(truehdRemaining, len(data)-pos)
				truehdRanges = append(truehdRanges, PESPayloadRange{
					FileOffset: r.FileOffset + int64(pos),
					Size:       consume,
					ESOffset:   truehdES,
				})
				truehdES += int64(consume)
				truehdRemaining -= consume
				pos += consume
				continue
			}
			// At unit boundary — determine type
			available := len(data) - pos
			// Need at least 2 bytes to determine type
			if available < 2 {
				copy(headerBuf[:], data[pos:])
				headerBufLen = available
				pendingRanges = []pendingRange{{r.FileOffset + int64(pos), available}}
				pos = len(data)
				continue
			}
			// Check for AC3 sync word
			if data[pos] == 0x0B && data[pos+1] == 0x77 {
				if available < 5 {
					// Need more bytes for AC3 header
					copy(headerBuf[:], data[pos:pos+available])
					headerBufLen = available
					pendingRanges = []pendingRange{{r.FileOffset + int64(pos), available}}
					pos = len(data)
					continue
				}
				fscod := (data[pos+4] >> 6) & 0x03
				frmsizecod := data[pos+4] & 0x3F
				frameSize := AC3FrameSize(fscod, frmsizecod)
				if frameSize > 0 {
					ac3Remaining = frameSize
					continue
				}
			}
			// TrueHD AU: parse length from first 2 bytes
			auLen := ParseTrueHDAULength(data[pos:])
			if auLen >= 4 {
				truehdRemaining = auLen
				continue
			}
			// Unrecognized — consume byte-by-byte as TrueHD
			truehdRanges = append(truehdRanges, PESPayloadRange{
				FileOffset: r.FileOffset + int64(pos),
				Size:       1,
				ESOffset:   truehdES,
			})
			truehdES++
			pos++
		}
	}
	// Attribute remaining buffered bytes to TrueHD
	if headerBufLen > 0 {
		for _, pr := range pendingRanges {
			truehdRanges = append(truehdRanges, PESPayloadRange{
				FileOffset: pr.fileOffset,
				Size:       pr.size,
				ESOffset:   truehdES,
			})
			truehdES += int64(pr.size)
		}
	}
	return ac3Ranges, truehdRanges
}
package source
// mergeAdjacentRanges merges consecutive PESPayloadRange entries that are
// contiguous in both file offset and ES offset.
func mergeAdjacentRanges(ranges []PESPayloadRange) []PESPayloadRange {
	if len(ranges) <= 1 {
		return ranges
	}
	out := make([]PESPayloadRange, 0, len(ranges)/2)
	out = append(out, ranges[0])
	for _, cur := range ranges[1:] {
		tail := &out[len(out)-1]
		fileContig := cur.FileOffset == tail.FileOffset+int64(tail.Size)
		esContig := cur.ESOffset == tail.ESOffset+int64(tail.Size)
		if fileContig && esContig {
			// Extend the previous run instead of emitting a new entry.
			tail.Size += cur.Size
		} else {
			out = append(out, cur)
		}
	}
	return out
}
package source
import (
"sort"
"sync/atomic"
)
// multiRegionData provides a virtual contiguous view over multiple
// non-contiguous byte slices from a memory-mapped ISO. Used for
// multi-extent UDF files where M2TS data is split across
// non-contiguous ISO regions.
type multiRegionData struct {
	// regions are ordered by logicalStart and together form the
	// virtual contiguous byte stream.
	regions []multiRegion
	// totalSize is the sum of all region lengths (logical bytes).
	totalSize int64
	lastIdx   atomic.Int32 // cached region index for fast sequential access
}

// multiRegion is one contiguous piece of the virtual stream.
type multiRegion struct {
	data         []byte
	logicalStart int64 // cumulative offset in the virtual contiguous view
}
// newMultiRegionData creates a multiRegionData from ISO physical extents.
// Each extent becomes a region backed by a sub-slice of isoData (zero-copy).
//
// Extents with out-of-range or inverted/negative bounds (corrupted or
// malformed UDF metadata) are clamped to the ISO's limits. An extent that
// clamps to nothing becomes an empty placeholder region so region indices
// stay aligned with the input extents.
func newMultiRegionData(extents []isoPhysicalRange, isoData []byte) *multiRegionData {
	mr := &multiRegionData{
		regions: make([]multiRegion, len(extents)),
	}
	isoLen := int64(len(isoData))
	logicalOff := int64(0)
	for i, ext := range extents {
		// Clamp the extent to [0, isoLen). This single path also neutralizes
		// a negative ext.Length (end < start), which previously passed the
		// in-bounds check and caused an out-of-range slice panic on
		// isoData[ext.ISOOffset:end].
		start := ext.ISOOffset
		end := ext.ISOOffset + ext.Length
		if start < 0 {
			start = 0
		}
		if end > isoLen {
			end = isoLen
		}
		if start >= end {
			// Degenerate extent: placeholder region with no data.
			mr.regions[i] = multiRegion{logicalStart: logicalOff}
			continue
		}
		mr.regions[i] = multiRegion{
			data:         isoData[start:end],
			logicalStart: logicalOff,
		}
		logicalOff += end - start
	}
	mr.totalSize = logicalOff
	return mr
}
// Len returns the total logical size across all regions, in bytes.
func (m *multiRegionData) Len() int64 { return m.totalSize }
// regionFor returns the index of the region containing the given logical offset.
// Returns len(m.regions) if the offset is beyond all regions.
func (m *multiRegionData) regionFor(off int64) int {
	// Fast path: sequential callers usually stay within the same region,
	// so try the atomically cached index before searching.
	if ci := int(m.lastIdx.Load()); ci < len(m.regions) {
		reg := m.regions[ci]
		regEnd := reg.logicalStart + int64(len(reg.data))
		if off >= reg.logicalStart && off < regEnd {
			return ci
		}
	}
	// Slow path: binary search for the first region whose end exceeds off.
	found := sort.Search(len(m.regions), func(i int) bool {
		return m.regions[i].logicalStart+int64(len(m.regions[i].data)) > off
	})
	if found < len(m.regions) {
		m.lastIdx.Store(int32(found))
	}
	return found
}
// ByteAt returns the byte at the given logical offset.
// Returns 0 if the offset is out of bounds.
func (m *multiRegionData) ByteAt(off int64) byte {
	if off < 0 || off >= m.totalSize {
		return 0
	}
	ri := m.regionFor(off)
	if ri >= len(m.regions) {
		return 0
	}
	reg := m.regions[ri]
	return reg.data[off-reg.logicalStart]
}
// Slice returns the bytes in the logical range [off, end).
// A range contained in a single region comes back as a zero-copy
// sub-slice; a range crossing a region boundary is assembled into a
// freshly allocated buffer.
func (m *multiRegionData) Slice(off, end int64) []byte {
	if off < 0 || end < 0 || end <= off {
		return nil
	}
	first := m.regionFor(off)
	if first >= len(m.regions) {
		return nil
	}
	head := m.regions[first]
	lo := off - head.logicalStart
	hi := end - head.logicalStart
	if hi <= int64(len(head.data)) {
		// Fast path: the whole range lives in one region (zero-copy).
		return head.data[lo:hi]
	}
	// Slow path: gather pieces from successive regions into one buffer.
	want := int(end - off)
	out := make([]byte, want)
	n := copy(out, head.data[lo:])
	for j := first + 1; j < len(m.regions) && n < want; j++ {
		n += copy(out[n:], m.regions[j].data)
	}
	return out
}
// Package source provides functionality for indexing source media files (DVD ISOs, Blu-ray directories).
package source
import (
"errors"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"github.com/stuckj/mkvdup/internal/mmap"
)
// Type represents the type of source media.
type Type int

// Source type constants.
const (
	TypeDVD    Type = iota // Contains .iso file
	TypeBluray             // Contains BDMV/STREAM/*.m2ts
)

// String returns a human-readable name for the source type.
// Unrecognized values render as "Unknown".
func (t Type) String() string {
	if t == TypeDVD {
		return "DVD"
	}
	if t == TypeBluray {
		return "Blu-ray"
	}
	return "Unknown"
}
// ErrUnknownSourceType is returned when the source directory type cannot be
// determined: it contains neither an ISO image nor a BDMV directory tree.
var ErrUnknownSourceType = errors.New("unknown source type: directory contains neither ISO nor BDMV structure")
// DetectType determines whether a directory contains a DVD ISO or Blu-ray structure.
// ISOs are inspected to determine if they contain DVD (VIDEO_TS) or Blu-ray (BDMV) content.
func DetectType(dir string) (Type, error) {
	// Gather ISO images from the directory itself and one level down
	// (a nested subdirectory is a common rip layout).
	topISOs, err := filepath.Glob(filepath.Join(dir, "*.iso"))
	if err != nil {
		return 0, err
	}
	nestedISOs, err := filepath.Glob(filepath.Join(dir, "*", "*.iso"))
	if err != nil {
		return 0, err
	}
	allISOs := append(topISOs, nestedISOs...)
	if len(allISOs) > 0 {
		// Inspect the first ISO; if it cannot be read, fall back to the
		// legacy DVD default.
		t, err := detectISOType(allISOs[0])
		if err != nil {
			return TypeDVD, nil
		}
		return t, nil
	}
	// No ISOs: look for an extracted Blu-ray directory tree.
	streams, err := filepath.Glob(filepath.Join(dir, "BDMV", "STREAM", "*.m2ts"))
	if err != nil {
		return 0, err
	}
	if len(streams) > 0 {
		return TypeBluray, nil
	}
	return 0, ErrUnknownSourceType
}
// detectISOType examines an ISO file to determine if it's a DVD or Blu-ray.
// DVDs have VIDEO_TS directory, Blu-rays have BDMV directory.
// Uses minimal reads to avoid loading the entire ISO into memory.
func detectISOType(isoPath string) (Type, error) {
	f, err := os.Open(isoPath)
	if err != nil {
		return 0, err
	}
	defer f.Close()
	// ISO9660 primary volume descriptor is at sector 16 (2048 bytes per sector)
	// The root directory record is embedded in the volume descriptor at offset 156.
	const sectorSize = 2048
	const pvdOffset = 16 * sectorSize
	// Read the primary volume descriptor
	pvd := make([]byte, sectorSize)
	if _, err := f.ReadAt(pvd, pvdOffset); err != nil {
		return 0, err
	}
	// Check volume descriptor type (byte 0) and signature "CD001" (bytes 1-5)
	if pvd[0] != 1 || string(pvd[1:6]) != "CD001" {
		// No ISO9660 PVD. Check for UDF (Blu-ray ISOs from CloneBD).
		if isUDFImage(f) {
			return detectUDFISOType(f)
		}
		// Neither ISO9660 nor UDF: fall back to the legacy DVD default.
		return TypeDVD, nil
	}
	// Root directory record is at offset 156, length at byte 0 of the record
	rootDirRecord := pvd[156:]
	if len(rootDirRecord) < 34 {
		// A directory record is at least 34 bytes; a truncated one is
		// unparseable, so default to DVD.
		return TypeDVD, nil
	}
	// Extract root directory extent location (bytes 2-5, little-endian)
	rootExtent := uint32(rootDirRecord[2]) | uint32(rootDirRecord[3])<<8 |
		uint32(rootDirRecord[4])<<16 | uint32(rootDirRecord[5])<<24
	// Extract root directory data length (bytes 10-13, little-endian)
	rootDataLen := uint32(rootDirRecord[10]) | uint32(rootDirRecord[11])<<8 |
		uint32(rootDirRecord[12])<<16 | uint32(rootDirRecord[13])<<24
	// Read the root directory
	// Limit to first 16KB to avoid reading huge directories
	if rootDataLen > 16*1024 {
		rootDataLen = 16 * 1024
	}
	rootDir := make([]byte, rootDataLen)
	if _, err := f.ReadAt(rootDir, int64(rootExtent)*sectorSize); err != nil {
		return 0, err
	}
	// Parse directory entries looking for VIDEO_TS or BDMV
	hasBDMV := false
	hasVideoTS := false
	offset := 0
	for offset < len(rootDir) {
		// Each directory record begins with its own length byte.
		recLen := int(rootDir[offset])
		if recLen == 0 {
			// Directory records never span sectors: a zero length means
			// the rest of this sector is padding.
			// Move to next sector boundary
			nextSector := ((offset / sectorSize) + 1) * sectorSize
			if nextSector >= len(rootDir) {
				break
			}
			offset = nextSector
			continue
		}
		if offset+recLen > len(rootDir) {
			break
		}
		// Name length is at offset 32
		if offset+33 > len(rootDir) {
			break
		}
		nameLen := int(rootDir[offset+32])
		if offset+33+nameLen > len(rootDir) {
			break
		}
		// Extract and check the filename
		name := strings.ToUpper(string(rootDir[offset+33 : offset+33+nameLen]))
		// Strip version number (;1) if present
		if idx := strings.Index(name, ";"); idx >= 0 {
			name = name[:idx]
		}
		// Strip trailing dot if present
		name = strings.TrimSuffix(name, ".")
		if name == "BDMV" {
			hasBDMV = true
		}
		if name == "VIDEO_TS" {
			hasVideoTS = true
		}
		offset += recLen
	}
	// Blu-ray takes precedence if both are present
	if hasBDMV {
		return TypeBluray, nil
	}
	if hasVideoTS {
		return TypeDVD, nil
	}
	// Default to DVD for unrecognized ISOs
	return TypeDVD, nil
}
// File represents a source file within the source directory.
type File struct {
	RelativePath string // Path relative to source directory
	Size         int64  // File size in bytes
	Checksum     uint64 // xxhash of file for integrity
}

// Location represents a position within a source file where a hash was found.
// FileIndex is a uint16, so an index can reference at most 65536 files.
type Location struct {
	FileIndex        uint16 // Index into Files array
	Offset           int64  // Offset within that file (or ES offset for MPEG-PS)
	IsVideo          bool   // For ES-based indexes: true for video ES, false for audio ES
	AudioSubStreamID byte   // For audio in MPEG-PS: sub-stream ID (0x80-0x87 = AC3, etc.)
}
// ESRangeConverter provides an interface for converting ES offsets to raw file offsets.
// This is used during dedup file creation to convert ES-based entries to raw-offset entries.
type ESRangeConverter interface {
	// RawRangesForESRegion returns the raw file ranges that contain the given ES region.
	// Each returned range represents a contiguous chunk of raw file data.
	// The sum of all returned range sizes equals the requested ES region size.
	// For video streams only - audio should use RawRangesForAudioSubStream.
	RawRangesForESRegion(esOffset int64, size int, isVideo bool) ([]RawRange, error)
	// RawRangesForAudioSubStream returns the raw file ranges for audio data from a specific sub-stream.
	RawRangesForAudioSubStream(subStreamID byte, esOffset int64, size int) ([]RawRange, error)
}

// ESReader provides an interface for reading elementary stream data from container files.
type ESReader interface {
	// ReadESData reads size bytes of ES data starting at esOffset.
	// The data is continuous ES data, with container headers stripped.
	// For video, this works as expected. For audio, use ReadAudioSubStreamData instead.
	ReadESData(esOffset int64, size int, isVideo bool) ([]byte, error)
	// ESOffsetToFileOffset converts an ES offset to a file offset and remaining bytes in that segment.
	ESOffsetToFileOffset(esOffset int64, isVideo bool) (fileOffset int64, remaining int)
	// TotalESSize returns the total size of the elementary stream.
	// For video, returns filtered video ES size. For audio, returns 0 - use AudioSubStreamESSize.
	TotalESSize(isVideo bool) int64
	// AudioSubStreams returns the list of audio sub-stream IDs in order of appearance.
	AudioSubStreams() []byte
	// AudioSubStreamESSize returns the ES size for a specific audio sub-stream.
	AudioSubStreamESSize(subStreamID byte) int64
	// ReadAudioSubStreamData reads audio data from a specific sub-stream.
	ReadAudioSubStreamData(subStreamID byte, esOffset int64, size int) ([]byte, error)
}

// PESRangeProvider provides access to PES payload ranges for building range maps.
// Both MPEGPSParser and MPEGTSParser implement this.
type PESRangeProvider interface {
	// FilteredVideoRanges returns the PES payload ranges for the video stream.
	FilteredVideoRanges() []PESPayloadRange
	// FilteredAudioRanges returns the PES payload ranges for one audio sub-stream.
	FilteredAudioRanges(subStreamID byte) []PESPayloadRange
	// AudioSubStreams returns the audio sub-stream IDs in order of appearance.
	AudioSubStreams() []byte
}

// FileOffsetAdjuster provides a function to convert parser-relative FileOffset
// values to source-file-relative offsets for range map storage.
// Implemented by isoM2TSAdapter where the parser operates on a sub-region
// of the ISO and FileOffset values need to be adjusted to ISO-relative.
type FileOffsetAdjuster interface {
	FileOffsetConverter() func(int64) int64
}

// RawReader provides an interface for reading raw file data.
type RawReader interface {
	// ReadAt reads len(buf) bytes starting at offset, following the
	// io.ReaderAt contract.
	ReadAt(buf []byte, offset int64) (int, error)
	// Slice returns a zero-copy slice of the underlying data.
	// Returns nil if offset is out of range.
	Slice(offset int64, size int) []byte
	// Len reports the total size of the underlying data.
	Len() int
	// Close releases the underlying resources.
	Close() error
}
// Index holds the hash-to-location mapping for fast lookup of byte sequences.
type Index struct {
	// HashToLocations maps from xxhash to list of locations where that hash was found
	HashToLocations map[uint64][]Location
	// SourceDir is the path to the source directory
	SourceDir string
	// SourceType indicates whether this is DVD or Blu-ray
	SourceType Type
	// Files lists all media files in the source
	Files []File
	// WindowSize is the number of bytes used for hashing
	WindowSize int
	// ESReaders provides ES-aware reading for each file (nil for raw files)
	// For MPEG-PS files, this allows reading continuous ES data.
	ESReaders []ESReader
	// RawReaders provides raw file reading for each file.
	// Used when raw file indexing is enabled.
	RawReaders []RawReader
	// MmapFiles holds the mmap file handles for proper cleanup.
	// These back the ESReaders for MPEG-PS files.
	MmapFiles []*mmap.File
	// UsesESOffsets indicates whether Location.Offset values are ES offsets
	// rather than raw file offsets. True for DVD (MPEG-PS) sources.
	UsesESOffsets bool
	// sortOnce ensures SortLocationsByOffset runs only once, making it
	// safe to call from multiple places before matching begins.
	sortOnce sync.Once
}
// NewIndex creates a new empty Index for the given source directory.
// Reader slices and file lists start empty and are populated by the caller.
func NewIndex(sourceDir string, sourceType Type, windowSize int) *Index {
	idx := &Index{
		HashToLocations: map[uint64][]Location{},
		SourceDir:       sourceDir,
		SourceType:      sourceType,
		WindowSize:      windowSize,
	}
	return idx
}
// SortLocationsByOffset sorts every location list by (FileIndex, Offset).
// This one-time cost at match setup enables binary search for nearby
// locations during matching. Must be called before concurrent access;
// repeat calls are no-ops.
func (idx *Index) SortLocationsByOffset() {
	idx.sortOnce.Do(func() {
		for h, list := range idx.HashToLocations {
			if len(list) < 2 {
				continue // nothing to order
			}
			sort.Slice(list, func(a, b int) bool {
				la, lb := list[a], list[b]
				if la.FileIndex == lb.FileIndex {
					return la.Offset < lb.Offset
				}
				return la.FileIndex < lb.FileIndex
			})
			idx.HashToLocations[h] = list
		}
	})
}
// EnumerateMediaFiles returns the list of media files to index based on source type.
// Results are paths relative to dir, in glob (lexical) order.
//
// DVD sources are ISO images; Blu-ray sources are either an extracted
// BDMV/STREAM tree of .m2ts files or, failing that, ISO images.
func EnumerateMediaFiles(dir string, sourceType Type) ([]string, error) {
	var files []string
	var err error
	switch sourceType {
	case TypeDVD:
		files, err = globISOs(dir)
		if err != nil {
			return nil, err
		}
	case TypeBluray:
		// Prefer an extracted BDMV tree.
		files, err = filepath.Glob(filepath.Join(dir, "BDMV", "STREAM", "*.m2ts"))
		if err != nil {
			return nil, err
		}
		// If no extracted M2TS files, look for Blu-ray ISOs.
		if len(files) == 0 {
			files, err = globISOs(dir)
			if err != nil {
				return nil, err
			}
		}
	}
	// Convert to paths relative to the source directory.
	relFiles := make([]string, 0, len(files))
	for _, f := range files {
		rel, err := filepath.Rel(dir, f)
		if err != nil {
			return nil, err
		}
		relFiles = append(relFiles, rel)
	}
	return relFiles, nil
}

// globISOs returns *.iso files found directly in dir or one level down,
// top-level matches first (the two common ripped-disc layouts). It was
// previously duplicated inline in both EnumerateMediaFiles branches.
func globISOs(dir string) ([]string, error) {
	top, err := filepath.Glob(filepath.Join(dir, "*.iso"))
	if err != nil {
		return nil, err
	}
	nested, err := filepath.Glob(filepath.Join(dir, "*", "*.iso"))
	if err != nil {
		return nil, err
	}
	return append(top, nested...), nil
}
// GetFileInfo returns the size in bytes of the file at path, or the
// os.Stat error if the file cannot be examined.
func GetFileInfo(path string) (int64, error) {
	fi, statErr := os.Stat(path)
	if statErr != nil {
		return 0, statErr
	}
	return fi.Size(), nil
}
// ReadRawDataAt reads raw data from the source file at the given location.
// This is used for raw file indexing (non-ES mode).
// Note: This copies data. Prefer RawSlice for zero-copy access.
//
// A read that fills the entire buffer suppresses the error (e.g. io.EOF
// exactly at end of file); a short read returns the partial data plus error.
func (idx *Index) ReadRawDataAt(loc Location, size int) ([]byte, error) {
	fi := int(loc.FileIndex)
	if fi >= len(idx.RawReaders) || idx.RawReaders[fi] == nil {
		return nil, errors.New("no raw reader for file")
	}
	buf := make([]byte, size)
	n, err := idx.RawReaders[fi].ReadAt(buf, loc.Offset)
	if err == nil || n >= size {
		return buf[:n], nil
	}
	return buf[:n], err
}
// RawSlice returns a zero-copy slice of raw data at the given location.
// Returns nil if the location has no raw reader or is out of range.
func (idx *Index) RawSlice(loc Location, size int) []byte {
	fi := int(loc.FileIndex)
	if fi >= len(idx.RawReaders) {
		return nil
	}
	rr := idx.RawReaders[fi]
	if rr == nil {
		return nil
	}
	return rr.Slice(loc.Offset, size)
}
package source
// FindPGSSyncPoints returns byte offsets of PGS segment boundaries in data.
// PGS segments have a 3-byte header: [type (1 byte)] [size (2 bytes BE)].
// Each segment start is a sync point; scanning stops at the first byte
// that is not a valid segment type.
func FindPGSSyncPoints(data []byte) []int {
	var syncs []int
	for pos := 0; pos+3 <= len(data); {
		if !isValidPGSSegmentType(data[pos]) {
			break
		}
		syncs = append(syncs, pos)
		// Payload size is big-endian in the two bytes after the type.
		payload := int(data[pos+1])<<8 | int(data[pos+2])
		pos += 3 + payload
	}
	return syncs
}

// isValidPGSSegmentType reports whether t is one of the five defined PGS
// segment types: PDS (0x14), ODS (0x15), PCS (0x16), WDS (0x17), END (0x80).
func isValidPGSSegmentType(t byte) bool {
	return t == 0x14 || t == 0x15 || t == 0x16 || t == 0x17 || t == 0x80
}
package source
// ParseTrueHDAULength extracts the access unit length in bytes from the
// first 2 bytes of a TrueHD AU header. The low 12 bits hold the length
// in 16-bit words, so the byte count is twice that value. Headers
// shorter than 2 bytes yield 0.
func ParseTrueHDAULength(header []byte) int {
	if len(header) < 2 {
		return 0
	}
	words := (int(header[0]&0x0F) << 8) | int(header[1])
	return words * 2
}
package source
import (
"encoding/binary"
"fmt"
"os"
"strings"
)
// UDF descriptor tag IDs (ECMA-167 descriptor tag identifiers).
const (
	udfTagAVDP          = 2   // Anchor Volume Descriptor Pointer
	udfTagPartitionDesc = 5   // Partition Descriptor
	udfTagLogicalVolume = 6   // Logical Volume Descriptor
	udfTagFileSetDesc   = 256 // File Set Descriptor
	udfTagFileEntry     = 261 // File Entry
	udfTagFID           = 257 // File Identifier Descriptor
	udfTagExtFileEntry  = 266 // Extended File Entry
)

// udfDescriptorTag is the 16-byte tag at the start of every UDF descriptor.
// Only the fields needed for navigation are retained.
type udfDescriptorTag struct {
	TagID   uint16
	Version uint16
}

// udfExtent represents a physical extent (offset + length) on disk.
type udfExtent struct {
	Length   uint32
	Location uint32
}

// udfLongAD is a "long allocation descriptor" (16 bytes) used to reference
// data across partitions.
type udfLongAD struct {
	Length   uint32
	Location uint32 // logical block number within partition
	PartRef  uint16 // partition reference number
}

// udfShortAD is a "short allocation descriptor" (8 bytes). It carries no
// partition reference; the partition is inherited from context.
type udfShortAD struct {
	Length   uint32
	Position uint32 // logical block number
}

// udfPartitionDesc holds fields from a UDF Partition Descriptor (tag 5).
type udfPartitionDesc struct {
	PartitionNumber  uint16
	StartingLocation uint32 // physical sector number
}

// udfLogicalVolumeDesc holds fields from a UDF Logical Volume Descriptor (tag 6).
type udfLogicalVolumeDesc struct {
	BlockSize     uint32
	FSDLocation   udfLongAD // File Set Descriptor location
	PartitionMaps []udfPartitionMap
}

// udfPartitionMap describes a partition map entry from the Logical Volume Descriptor.
type udfPartitionMap struct {
	Type         byte // 1 = physical, 2 = metadata/virtual/sparable
	PartitionNum uint16
	IsMetadata   bool
	MetaFileLoc  uint32 // for metadata partitions: file location
}

// udfFileEntry holds parsed fields from a File Entry (tag 261) or
// Extended File Entry (tag 266).
type udfFileEntry struct {
	ICBTag     byte   // file type (4=directory, 5=file)
	InfoLength uint64 // information length (file size in bytes)
	AllocDescs []byte // raw allocation descriptors
	AllocType  byte   // 0=short_ad, 1=long_ad, 3=immediate/inline
	PartRef    uint16 // partition reference where this FE resides
}

// udfFID represents a File Identifier Descriptor (tag 257): one named
// entry in a directory, pointing at the entry's ICB.
type udfFID struct {
	Name        string
	IsDir       bool
	IsParent    bool
	ICBLocation udfLongAD
}
// isUDFImage reports whether the file carries a UDF Volume Recognition
// Sequence — a BEA01 marker plus an NSR02/NSR03 marker — somewhere in
// sectors 16 through 31.
func isUDFImage(f *os.File) bool {
	sector := make([]byte, isoSectorSize)
	var haveBEA, haveNSR bool
	for s := int64(16); s < 32; s++ {
		n, err := f.ReadAt(sector, s*isoSectorSize)
		if err != nil || n < 6 {
			continue
		}
		// The 5-byte structure identifier starts at byte 1 of the descriptor.
		switch string(sector[1:6]) {
		case "BEA01":
			haveBEA = true
		case "NSR02", "NSR03":
			haveNSR = true
		case "TEA01":
			// Terminator: nothing more to find in the sequence.
			return haveBEA && haveNSR
		}
		if haveBEA && haveNSR {
			return true
		}
	}
	return haveBEA && haveNSR
}
// detectUDFISOType inspects the UDF root directory to classify the ISO
// as Blu-ray (BDMV/) or DVD (VIDEO_TS/). Unparseable volumes — and
// volumes with neither directory — default to DVD, with Blu-ray taking
// precedence when both are present.
func detectUDFISOType(f *os.File) (Type, error) {
	entries, err := readUDFRootDir(f)
	if err != nil {
		// Can't walk the UDF structures: keep the legacy DVD default.
		return TypeDVD, nil
	}
	var bdmv bool
	for _, entry := range entries {
		if strings.ToUpper(entry.Name) == "BDMV" {
			bdmv = true
			break
		}
	}
	if bdmv {
		return TypeBluray, nil
	}
	// VIDEO_TS and "neither" both resolve to DVD.
	return TypeDVD, nil
}
// findBlurayM2TSInUDF walks the UDF filesystem to locate M2TS files
// under BDMV/STREAM/, returning isoFileExtent entries compatible with
// the ISO9660 code path. Entries that cannot be read or resolved are
// skipped; an error is returned only when no M2TS files are found at all.
func findBlurayM2TSInUDF(f *os.File) ([]isoFileExtent, error) {
	ctx, err := newUDFContext(f)
	if err != nil {
		return nil, err
	}
	// Walk root -> BDMV -> STREAM.
	rootDir, err := ctx.readDirectoryFromFE(ctx.rootFE)
	if err != nil {
		return nil, fmt.Errorf("read UDF root directory: %w", err)
	}
	bdmvFE, err := ctx.lookupDir(rootDir, "BDMV")
	if err != nil {
		return nil, fmt.Errorf("find BDMV: %w", err)
	}
	bdmvDir, err := ctx.readDirectoryFromFE(bdmvFE)
	if err != nil {
		return nil, fmt.Errorf("read BDMV directory: %w", err)
	}
	streamFE, err := ctx.lookupDir(bdmvDir, "STREAM")
	if err != nil {
		return nil, fmt.Errorf("find STREAM: %w", err)
	}
	streamDir, err := ctx.readDirectoryFromFE(streamFE)
	if err != nil {
		return nil, fmt.Errorf("read STREAM directory: %w", err)
	}
	// Pick out .M2TS entries and resolve their physical extents.
	var results []isoFileExtent
	for _, entry := range streamDir {
		if entry.IsDir || entry.IsParent {
			continue
		}
		upperName := strings.ToUpper(entry.Name)
		if !strings.HasSuffix(upperName, ".M2TS") {
			continue
		}
		fe, err := ctx.readFileEntryAt(entry.ICBLocation)
		if err != nil {
			continue // unreadable entry: skip rather than fail the disc
		}
		extents, err := ctx.resolveAllExtents(fe)
		if err != nil || len(extents) == 0 {
			continue
		}
		m2ts := isoFileExtent{
			Name:   upperName,
			Offset: extents[0].ISOOffset,
			Size:   int64(fe.InfoLength),
			IsDir:  false,
		}
		// Contiguous files are fully described by Offset+Size; only
		// fragmented files need the full extent list.
		if !extentsContiguous(extents) {
			m2ts.Extents = extents
		}
		results = append(results, m2ts)
	}
	if len(results) == 0 {
		return nil, fmt.Errorf("no M2TS files found in UDF BDMV/STREAM/")
	}
	return results, nil
}
// udfContext holds the parsed UDF volume structures needed for navigation.
type udfContext struct {
	f          *os.File
	blockSize  uint32             // logical block size from the LVD
	partStart  uint32             // physical sector of partition start
	partitions []udfPartitionDesc // all partition descriptors from the VDS
	partMaps   []udfPartitionMap  // partition maps from the LVD, indexed by partition reference
	metaData   []byte             // loaded metadata partition file (nil if Type 1 only)
	rootFE     *udfFileEntry      // root directory File Entry
}
// newUDFContext reads and parses the UDF volume structures needed to
// navigate the filesystem: AVDP -> VDS (partition + logical volume
// descriptors) -> optional metadata partition -> FSD -> root File Entry.
func newUDFContext(f *os.File) (*udfContext, error) {
	// The Anchor Volume Descriptor Pointer locates the main VDS.
	vdsExtent, err := readAVDP(f)
	if err != nil {
		return nil, fmt.Errorf("read AVDP: %w", err)
	}
	partDescs, lvd, err := readVDS(f, vdsExtent)
	if err != nil {
		return nil, fmt.Errorf("read VDS: %w", err)
	}
	if len(partDescs) == 0 {
		return nil, fmt.Errorf("no partition descriptors found in VDS")
	}
	// Resolve the physical start of the partition referenced by the
	// first partition map; fall back to the first descriptor.
	start := partDescs[0].StartingLocation
	if len(lvd.PartitionMaps) > 0 {
		want := lvd.PartitionMaps[0].PartitionNum
		for _, pd := range partDescs {
			if pd.PartitionNumber == want {
				start = pd.StartingLocation
				break
			}
		}
	}
	ctx := &udfContext{
		f:          f,
		blockSize:  lvd.BlockSize,
		partStart:  start,
		partitions: partDescs,
		partMaps:   lvd.PartitionMaps,
	}
	// Load the metadata partition up front when one is declared; its
	// contents back all metadata-partition block reads later.
	for _, pm := range lvd.PartitionMaps {
		if !pm.IsMetadata {
			continue
		}
		md, err := ctx.readMetadataFile(pm.MetaFileLoc)
		if err != nil {
			return nil, fmt.Errorf("read metadata partition: %w", err)
		}
		ctx.metaData = md
		break
	}
	// The File Set Descriptor leads to the root directory ICB.
	root, err := ctx.readFSDAndRoot(lvd.FSDLocation)
	if err != nil {
		return nil, fmt.Errorf("read FSD/root: %w", err)
	}
	ctx.rootFE = root
	return ctx, nil
}
// readAVDP reads the Anchor Volume Descriptor Pointer at sector 256 and
// returns the extent of the Main Volume Descriptor Sequence.
func readAVDP(f *os.File) (udfExtent, error) {
	sector := make([]byte, isoSectorSize)
	if _, err := f.ReadAt(sector, 256*isoSectorSize); err != nil {
		return udfExtent{}, fmt.Errorf("read sector 256: %w", err)
	}
	if tag := parseDescriptorTag(sector); tag.TagID != udfTagAVDP {
		return udfExtent{}, fmt.Errorf("sector 256: expected AVDP (tag 2), got tag %d", tag.TagID)
	}
	// The Main VDS extent record (length then location) sits at byte 16.
	ext := udfExtent{
		Length:   binary.LittleEndian.Uint32(sector[16:20]),
		Location: binary.LittleEndian.Uint32(sector[20:24]),
	}
	return ext, nil
}
// readVDS reads the Volume Descriptor Sequence and extracts partition
// descriptors and the logical volume descriptor.
// The scan stops at a Terminating Descriptor (tag 8), at the first read
// error, or after at most 64 sectors (a sanity cap for corrupt lengths).
func readVDS(f *os.File, extent udfExtent) ([]udfPartitionDesc, *udfLogicalVolumeDesc, error) {
	var partDescs []udfPartitionDesc
	var lvd *udfLogicalVolumeDesc
	sectors := int(extent.Length) / isoSectorSize
	if sectors > 64 {
		sectors = 64
	}
	buf := make([]byte, isoSectorSize)
	for i := 0; i < sectors; i++ {
		offset := int64(extent.Location+uint32(i)) * isoSectorSize
		if _, err := f.ReadAt(buf, offset); err != nil {
			break
		}
		tag := parseDescriptorTag(buf)
		switch tag.TagID {
		case udfTagPartitionDesc:
			// Partition number at offset 22, starting sector at offset 188.
			pd := udfPartitionDesc{
				PartitionNumber:  binary.LittleEndian.Uint16(buf[22:24]),
				StartingLocation: binary.LittleEndian.Uint32(buf[188:192]),
			}
			partDescs = append(partDescs, pd)
		case udfTagLogicalVolume:
			// Logical block size at offset 212.
			blockSize := binary.LittleEndian.Uint32(buf[212:216])
			// FSD location at offset 248 (16-byte long_ad)
			fsdLoc := parseLongAD(buf[248:264])
			// Partition maps at offset 440
			mapTableLen := binary.LittleEndian.Uint32(buf[264:268])
			numMaps := binary.LittleEndian.Uint32(buf[268:272])
			mapData := buf[440:]
			if int(mapTableLen) < len(mapData) {
				mapData = mapData[:mapTableLen]
			}
			partMaps := parsePartitionMaps(mapData, int(numMaps))
			lvd = &udfLogicalVolumeDesc{
				BlockSize:     blockSize,
				FSDLocation:   fsdLoc,
				PartitionMaps: partMaps,
			}
		case 8: // Terminating Descriptor
			// handled below
		}
		if tag.TagID == 8 {
			break
		}
	}
	if lvd == nil {
		return nil, nil, fmt.Errorf("no Logical Volume Descriptor found")
	}
	return partDescs, lvd, nil
}
// parsePartitionMaps parses the partition map table from the LVD.
// data is the raw map table (already truncated to the declared table
// length) and count is the declared number of maps. Parsing stops early
// on a zero-length or truncated entry.
func parsePartitionMaps(data []byte, count int) []udfPartitionMap {
	var maps []udfPartitionMap
	offset := 0
	for i := 0; i < count && offset < len(data); i++ {
		// Each map entry starts with [type (1 byte)] [length (1 byte)].
		if offset+2 > len(data) {
			break
		}
		mapType := data[offset]
		mapLen := int(data[offset+1])
		if mapLen == 0 || offset+mapLen > len(data) {
			break
		}
		pm := udfPartitionMap{Type: mapType}
		switch mapType {
		case 1:
			// Type 1: Physical partition (6 bytes)
			if mapLen >= 6 {
				pm.PartitionNum = binary.LittleEndian.Uint16(data[offset+4 : offset+6])
			}
		case 2:
			// Type 2: Could be metadata, virtual, or sparable (64 bytes)
			if mapLen >= 64 {
				pm.PartitionNum = binary.LittleEndian.Uint16(data[offset+38 : offset+40])
				// Check for metadata partition identifier at offset 4
				ident := string(data[offset+4 : offset+36])
				if strings.Contains(ident, "*UDF Metadata Partition") {
					pm.IsMetadata = true
					pm.MetaFileLoc = binary.LittleEndian.Uint32(data[offset+40 : offset+44])
				}
			}
		}
		maps = append(maps, pm)
		offset += mapLen
	}
	return maps
}
// readMetadataFile loads the metadata virtual file from the partition.
// The metadata file's File Entry sits at partStart + metaFileLoc; its
// allocation descriptors point to the actual metadata contents.
func (ctx *udfContext) readMetadataFile(metaFileLoc uint32) ([]byte, error) {
	sector := ctx.partStart + metaFileLoc
	blk := make([]byte, ctx.blockSize)
	if _, err := ctx.f.ReadAt(blk, int64(sector)*int64(ctx.blockSize)); err != nil {
		return nil, fmt.Errorf("read metadata file entry at sector %d: %w", sector, err)
	}
	fe, err := parseFileEntry(blk)
	if err != nil {
		return nil, fmt.Errorf("parse metadata file entry: %w", err)
	}
	// The metadata file's FE lives on the physical partition; point its
	// partition reference at the first Type 1 map so short_ad descriptors
	// resolve against the right partition.
	for i, pm := range ctx.partMaps {
		if pm.Type == 1 {
			fe.PartRef = uint16(i)
			break
		}
	}
	return ctx.readFileData(fe)
}
// readFSDAndRoot reads the File Set Descriptor and follows its root
// directory ICB to the root directory File Entry.
func (ctx *udfContext) readFSDAndRoot(fsdLoc udfLongAD) (*udfFileEntry, error) {
	data, err := ctx.readBlock(fsdLoc.Location, fsdLoc.PartRef)
	if err != nil {
		return nil, fmt.Errorf("read FSD block: %w", err)
	}
	if tag := parseDescriptorTag(data); tag.TagID != udfTagFileSetDesc {
		return nil, fmt.Errorf("expected FSD (tag 256), got tag %d", tag.TagID)
	}
	// The root directory ICB is a 16-byte long_ad at offset 400.
	if len(data) < 416 {
		return nil, fmt.Errorf("FSD too short")
	}
	return ctx.readFileEntryAt(parseLongAD(data[400:416]))
}
// readFileEntryAt reads and parses a File Entry at the given long_ad
// location, stamping it with the partition it was read from.
func (ctx *udfContext) readFileEntryAt(loc udfLongAD) (*udfFileEntry, error) {
	blk, err := ctx.readBlock(loc.Location, loc.PartRef)
	if err != nil {
		return nil, fmt.Errorf("read file entry block %d (part %d): %w", loc.Location, loc.PartRef, err)
	}
	fe, err := parseFileEntry(blk)
	if err != nil {
		return nil, err
	}
	fe.PartRef = loc.PartRef
	return fe, nil
}
// readDirectoryFromFE loads the directory contents referenced by fe and
// parses them into File Identifier Descriptors.
func (ctx *udfContext) readDirectoryFromFE(fe *udfFileEntry) ([]udfFID, error) {
	raw, err := ctx.readFileData(fe)
	if err != nil {
		return nil, err
	}
	return parseUDFDirectory(raw), nil
}
// lookupDir finds the entry with the given name among fids
// (case-insensitive, parent links excluded) and reads its File Entry.
func (ctx *udfContext) lookupDir(fids []udfFID, name string) (*udfFileEntry, error) {
	target := strings.ToUpper(name)
	for _, fid := range fids {
		if fid.IsParent || strings.ToUpper(fid.Name) != target {
			continue
		}
		return ctx.readFileEntryAt(fid.ICBLocation)
	}
	return nil, fmt.Errorf("%q not found in directory", name)
}
// maxAllocExtentChainDepth limits the number of type-3 allocation extent
// continuation hops to prevent infinite loops on corrupt/cyclic images.
// In practice even a badly fragmented 50 GB Blu-ray needs only 2-3 hops;
// 10000 is extremely conservative.
const maxAllocExtentChainDepth = 10000

// resolveAllExtents collects all physical extents for a file entry.
// For long_ad, each AD has an explicit partition reference.
// For short_ad, the partition is inherited from the FE.
// Handles allocation extent chaining (type 3 descriptors) for files
// whose allocation descriptors span multiple blocks.
//
// In each 32-bit AD length field, the top 2 bits encode the extent type
// (0=recorded, 1=allocated-not-recorded, 2=unallocated, 3=continuation)
// and the low 30 bits encode the extent length in bytes.
func (ctx *udfContext) resolveAllExtents(fe *udfFileEntry) ([]isoPhysicalRange, error) {
	allocDescs := fe.AllocDescs
	switch fe.AllocType & 0x07 {
	case 0: // short_ad
		if int(fe.PartRef) < len(ctx.partMaps) && ctx.partMaps[fe.PartRef].IsMetadata {
			return nil, fmt.Errorf("short_ad on metadata partition not supported for file extents")
		}
		var extents []isoPhysicalRange
		remaining := int64(fe.InfoLength)
		chainDepth := 0
		// visited guards against cyclic continuation chains; keys are
		// (partition reference, block) pairs.
		visited := map[[2]uint32]bool{}
		for remaining > 0 {
			followed := false
			for off := 0; off+8 <= len(allocDescs) && remaining > 0; off += 8 {
				ad := parseShortAD(allocDescs[off : off+8])
				extType := (ad.Length >> 30) & 0x03
				extLen := int64(ad.Length & 0x3FFFFFFF)
				if extLen == 0 {
					break // end-of-descriptor-list marker
				}
				if extType == 3 {
					// Type 3: continuation — restart the inner scan on the
					// next block of allocation descriptors.
					chainDepth++
					if chainDepth > maxAllocExtentChainDepth {
						return nil, fmt.Errorf("short_ad alloc extent chain depth exceeded %d", maxAllocExtentChainDepth)
					}
					key := [2]uint32{uint32(fe.PartRef), ad.Position}
					if visited[key] {
						return nil, fmt.Errorf("cycle in short_ad alloc extent chain at block %d part %d", ad.Position, fe.PartRef)
					}
					visited[key] = true
					nextDescs, err := ctx.readAllocExtentBlock(ad.Position, fe.PartRef)
					if err != nil {
						return nil, fmt.Errorf("follow short_ad alloc extent chain: %w", err)
					}
					allocDescs = nextDescs
					followed = true
					break
				}
				if extLen > remaining {
					extLen = remaining
				}
				if extType == 0 {
					// Type 0: recorded and allocated — actual data extent
					extents = append(extents, isoPhysicalRange{
						ISOOffset: ctx.resolveBlockPhysical(ad.Position),
						Length:    extLen,
					})
				}
				// Type 1 (allocated, not recorded) and type 2 (not allocated)
				// are sparse holes with no data on disc — skip without appending.
				remaining -= extLen
			}
			if !followed {
				break
			}
		}
		if remaining > 0 {
			return nil, fmt.Errorf("short_ad allocation descriptors truncated: %d bytes remaining", remaining)
		}
		return extents, nil
	case 1: // long_ad
		var extents []isoPhysicalRange
		remaining := int64(fe.InfoLength)
		chainDepth := 0
		visited := map[[2]uint32]bool{}
		for remaining > 0 {
			followed := false
			for off := 0; off+16 <= len(allocDescs) && remaining > 0; off += 16 {
				ad := parseLongAD(allocDescs[off : off+16])
				extType := (ad.Length >> 30) & 0x03
				extLen := int64(ad.Length & 0x3FFFFFFF)
				if extLen == 0 {
					break // end-of-descriptor-list marker
				}
				if extType == 3 {
					// Type 3: continuation — follow to the next AD block.
					chainDepth++
					if chainDepth > maxAllocExtentChainDepth {
						return nil, fmt.Errorf("long_ad alloc extent chain depth exceeded %d", maxAllocExtentChainDepth)
					}
					key := [2]uint32{uint32(ad.PartRef), ad.Location}
					if visited[key] {
						return nil, fmt.Errorf("cycle in long_ad alloc extent chain at block %d part %d", ad.Location, ad.PartRef)
					}
					visited[key] = true
					nextDescs, err := ctx.readAllocExtentBlock(ad.Location, ad.PartRef)
					if err != nil {
						return nil, fmt.Errorf("follow long_ad alloc extent chain: %w", err)
					}
					allocDescs = nextDescs
					followed = true
					break
				}
				if extLen > remaining {
					extLen = remaining
				}
				if extType == 0 {
					// Type 0: recorded and allocated — actual data extent
					if int(ad.PartRef) < len(ctx.partMaps) && ctx.partMaps[ad.PartRef].IsMetadata {
						return nil, fmt.Errorf("long_ad data extent on metadata partition")
					}
					extents = append(extents, isoPhysicalRange{
						ISOOffset: ctx.resolveBlockPhysical(ad.Location),
						Length:    extLen,
					})
				}
				// Type 1 (allocated, not recorded) and type 2 (not allocated)
				// are sparse holes with no data on disc — skip without appending.
				remaining -= extLen
			}
			if !followed {
				break
			}
		}
		if remaining > 0 {
			return nil, fmt.Errorf("long_ad allocation descriptors truncated: %d bytes remaining", remaining)
		}
		return extents, nil
	default:
		return nil, fmt.Errorf("unsupported alloc type %d for extent resolution", fe.AllocType&0x07)
	}
}
// readAllocExtentBlock reads a block of continuation allocation
// descriptors (the target of a type 3 extent). The block begins with an
// Allocation Extent Descriptor (tag 258) header:
//
//	bytes 0-15  descriptor tag
//	bytes 16-19 previous allocation extent location (uint32)
//	bytes 20-23 length of the allocation descriptors (uint32)
//	bytes 24+   allocation descriptor data
func (ctx *udfContext) readAllocExtentBlock(blockNum uint32, partRef uint16) ([]byte, error) {
	blk, err := ctx.readBlock(blockNum, partRef)
	if err != nil {
		return nil, fmt.Errorf("read alloc extent block %d (part %d): %w", blockNum, partRef, err)
	}
	if len(blk) < 24 {
		return nil, fmt.Errorf("alloc extent block too short")
	}
	if tag := parseDescriptorTag(blk); tag.TagID != 258 {
		return nil, fmt.Errorf("expected Allocation Extent Descriptor (tag 258), got tag %d", tag.TagID)
	}
	descLen := binary.LittleEndian.Uint32(blk[20:24])
	avail := len(blk) - 24
	if descLen > uint32(avail) {
		return nil, fmt.Errorf("allocation descriptor length %d exceeds remaining block bytes %d", descLen, avail)
	}
	return blk[24 : 24+int(descLen)], nil
}
// extentsContiguous reports whether every extent begins exactly where the
// previous one ends, i.e. the extents form one unbroken physical run.
// Zero or one extents are trivially contiguous.
func extentsContiguous(extents []isoPhysicalRange) bool {
	if len(extents) < 2 {
		return true
	}
	end := extents[0].ISOOffset + extents[0].Length
	for _, e := range extents[1:] {
		if e.ISOOffset != end {
			return false
		}
		end = e.ISOOffset + e.Length
	}
	return true
}
// readBlock reads a single filesystem block identified by a logical block
// number within the given partition reference. Metadata-partition blocks are
// served as a copy from the preloaded metadata image; physical-partition
// blocks are read from the underlying file.
func (ctx *udfContext) readBlock(blockNum uint32, partRef uint16) ([]byte, error) {
	// Metadata partition: blockNum indexes into the loaded metadata image.
	if int(partRef) < len(ctx.partMaps) && ctx.partMaps[partRef].IsMetadata {
		if ctx.metaData == nil {
			return nil, fmt.Errorf("metadata partition referenced but not loaded")
		}
		byteOffset := int64(blockNum) * int64(ctx.blockSize)
		if byteOffset+int64(ctx.blockSize) > int64(len(ctx.metaData)) {
			return nil, fmt.Errorf("metadata block %d out of range", blockNum)
		}
		result := make([]byte, ctx.blockSize)
		copy(result, ctx.metaData[byteOffset:byteOffset+int64(ctx.blockSize)])
		return result, nil
	}
	// Physical partition: blockNum is relative to the partition start.
	// Widen both operands before adding so partStart+blockNum cannot wrap
	// around uint32 on very large images.
	physOffset := (int64(ctx.partStart) + int64(blockNum)) * int64(ctx.blockSize)
	buf := make([]byte, ctx.blockSize)
	if _, err := ctx.f.ReadAt(buf, physOffset); err != nil {
		return nil, err
	}
	return buf, nil
}
// resolveBlockPhysical converts a logical block number on the default (first
// physical) partition to an absolute byte offset in the ISO file.
func (ctx *udfContext) resolveBlockPhysical(blockNum uint32) int64 {
	// Widen both operands before adding so partStart+blockNum cannot wrap
	// around uint32 on very large images.
	return (int64(ctx.partStart) + int64(blockNum)) * int64(ctx.blockSize)
}
// readFileData reads the complete data of a file described by a File Entry,
// dispatching on the ICB allocation type (low 3 bits of the ICB flags).
func (ctx *udfContext) readFileData(fe *udfFileEntry) ([]byte, error) {
	if fe.InfoLength == 0 {
		return nil, nil
	}
	allocType := fe.AllocType & 0x07
	switch allocType {
	case 0: // short_ad
		return ctx.readFromShortADs(fe)
	case 1: // long_ad
		return ctx.readFromLongADs(fe)
	case 3: // inline/immediate: data is embedded in the alloc descs area
		if fe.InfoLength <= uint64(len(fe.AllocDescs)) {
			return fe.AllocDescs[:fe.InfoLength], nil
		}
		// Declared length exceeds what is present; return what we have.
		return fe.AllocDescs, nil
	default:
		return nil, fmt.Errorf("unsupported allocation type %d", allocType)
	}
}
// readFromShortADs reads file data described by short allocation descriptors.
// Short ADs don't carry an explicit partition reference — they inherit the
// partition of the File Entry that contains them, so a single
// metadata-vs-physical decision applies to every extent.
func (ctx *udfContext) readFromShortADs(fe *udfFileEntry) ([]byte, error) {
	// Determine if this FE's partition is the (loaded) metadata partition.
	isMeta := int(fe.PartRef) < len(ctx.partMaps) && ctx.partMaps[fe.PartRef].IsMetadata && ctx.metaData != nil
	result := make([]byte, 0, fe.InfoLength)
	remaining := int64(fe.InfoLength)
	for off := 0; off+8 <= len(fe.AllocDescs) && remaining > 0; off += 8 {
		ad := parseShortAD(fe.AllocDescs[off : off+8])
		extLen := int64(ad.Length & 0x3FFFFFFF) // mask off extent type bits
		if extLen == 0 {
			break // end-of-descriptor-list marker
		}
		toRead := min(extLen, remaining)
		if isMeta {
			// Resolve within the loaded metadata data.
			byteOffset := int64(ad.Position) * int64(ctx.blockSize)
			if byteOffset+toRead > int64(len(ctx.metaData)) {
				return nil, fmt.Errorf("metadata short_ad extent out of range (offset %d, len %d, metaLen %d)",
					byteOffset, toRead, len(ctx.metaData))
			}
			result = append(result, ctx.metaData[byteOffset:byteOffset+toRead]...)
		} else {
			// Widen both operands before adding so partStart+Position cannot
			// wrap around uint32 on very large images.
			physOffset := (int64(ctx.partStart) + int64(ad.Position)) * int64(ctx.blockSize)
			buf := make([]byte, toRead)
			if _, err := ctx.f.ReadAt(buf, physOffset); err != nil {
				return nil, fmt.Errorf("read short_ad extent at offset %d: %w", physOffset, err)
			}
			result = append(result, buf...)
		}
		remaining -= toRead
	}
	return result, nil
}
// readFromLongADs reads file data described by long allocation descriptors.
// Each long_ad carries its own partition reference, so metadata-vs-physical
// resolution is decided per extent.
func (ctx *udfContext) readFromLongADs(fe *udfFileEntry) ([]byte, error) {
	result := make([]byte, 0, fe.InfoLength)
	remaining := int64(fe.InfoLength)
	for off := 0; off+16 <= len(fe.AllocDescs) && remaining > 0; off += 16 {
		ad := parseLongAD(fe.AllocDescs[off : off+16])
		extLen := int64(ad.Length & 0x3FFFFFFF) // mask off extent type bits
		if extLen == 0 {
			break // end-of-descriptor-list marker
		}
		toRead := min(extLen, remaining)
		// Check if this extent references the (loaded) metadata partition.
		if int(ad.PartRef) < len(ctx.partMaps) && ctx.partMaps[ad.PartRef].IsMetadata && ctx.metaData != nil {
			byteOffset := int64(ad.Location) * int64(ctx.blockSize)
			if byteOffset+toRead > int64(len(ctx.metaData)) {
				return nil, fmt.Errorf("metadata extent out of range")
			}
			result = append(result, ctx.metaData[byteOffset:byteOffset+toRead]...)
		} else {
			// Widen both operands before adding so partStart+Location cannot
			// wrap around uint32 on very large images.
			physOffset := (int64(ctx.partStart) + int64(ad.Location)) * int64(ctx.blockSize)
			buf := make([]byte, toRead)
			if _, err := ctx.f.ReadAt(buf, physOffset); err != nil {
				return nil, fmt.Errorf("read long_ad extent at offset %d: %w", physOffset, err)
			}
			result = append(result, buf...)
		}
		remaining -= toRead
	}
	return result, nil
}
// readUDFRootDir is a convenience helper that opens a UDF context on f and
// returns the root directory's file identifier descriptors.
// Used by detectUDFISOType.
func readUDFRootDir(f *os.File) ([]udfFID, error) {
	udf, err := newUDFContext(f)
	if err != nil {
		return nil, err
	}
	return udf.readDirectoryFromFE(udf.rootFE)
}
// --- Low-level parsing helpers ---
// parseDescriptorTag parses the 16-byte UDF descriptor tag at the start of buf.
// A buffer shorter than 16 bytes yields the zero tag.
func parseDescriptorTag(buf []byte) udfDescriptorTag {
	var tag udfDescriptorTag
	if len(buf) >= 16 {
		tag.TagID = binary.LittleEndian.Uint16(buf[0:2])
		tag.Version = binary.LittleEndian.Uint16(buf[2:4])
	}
	return tag
}
// parseLongAD parses a 16-byte long allocation descriptor: extent length,
// logical block location, and partition reference number.
func parseLongAD(buf []byte) udfLongAD {
	var ad udfLongAD
	ad.Length = binary.LittleEndian.Uint32(buf[0:4])
	ad.Location = binary.LittleEndian.Uint32(buf[4:8])
	ad.PartRef = binary.LittleEndian.Uint16(buf[8:10])
	return ad
}
// parseShortAD parses an 8-byte short allocation descriptor: extent length
// followed by extent position.
func parseShortAD(buf []byte) udfShortAD {
	var ad udfShortAD
	ad.Length = binary.LittleEndian.Uint32(buf[0:4])
	ad.Position = binary.LittleEndian.Uint32(buf[4:8])
	return ad
}
// parseFileEntry parses a UDF File Entry (tag 261) or Extended File Entry (tag 266).
//
// Only the fields this package needs are extracted: the ICB file type, the
// information (file) length, the allocation type from the low bits of the ICB
// flags, and a copy of the raw allocation descriptor bytes. The two entry
// variants share field offsets up to the ICB tag but place L_EA/L_AD at
// different offsets, hence the two branches below.
func parseFileEntry(data []byte) (*udfFileEntry, error) {
	if len(data) < 16 {
		return nil, fmt.Errorf("data too short for file entry")
	}
	tag := parseDescriptorTag(data)
	if tag.TagID != udfTagFileEntry && tag.TagID != udfTagExtFileEntry {
		return nil, fmt.Errorf("expected File Entry (tag 261/266), got tag %d", tag.TagID)
	}
	// ICB Tag at offset 16 (20 bytes), file type at ICB tag offset 11 (= data offset 27)
	if len(data) < 28 {
		return nil, fmt.Errorf("data too short for ICB tag")
	}
	fileType := data[27]
	var infoLength uint64
	var allocDescsOffset int
	var allocDescsLength uint32
	var icbFlags uint16
	if tag.TagID == udfTagFileEntry {
		// File Entry (tag 261)
		// ECMA-167 14.9: L_EA at 168, L_AD at 172, alloc descs at 176+L_EA
		if len(data) < 176 {
			return nil, fmt.Errorf("file entry too short")
		}
		infoLength = binary.LittleEndian.Uint64(data[56:64])
		icbFlags = binary.LittleEndian.Uint16(data[34:36])
		eaLen := binary.LittleEndian.Uint32(data[168:172])
		allocDescsLength = binary.LittleEndian.Uint32(data[172:176])
		allocDescsOffset = 176 + int(eaLen)
	} else {
		// Extended File Entry (tag 266)
		// ECMA-167 14.17: L_EA at 208, L_AD at 212, alloc descs at 216+L_EA
		if len(data) < 216 {
			return nil, fmt.Errorf("extended file entry too short")
		}
		infoLength = binary.LittleEndian.Uint64(data[56:64])
		icbFlags = binary.LittleEndian.Uint16(data[34:36])
		eaLen := binary.LittleEndian.Uint32(data[208:212])
		allocDescsLength = binary.LittleEndian.Uint32(data[212:216])
		allocDescsOffset = 216 + int(eaLen)
	}
	// Guard against overflow or out-of-bounds from malformed eaLen
	if allocDescsOffset < 0 || allocDescsOffset > len(data) {
		return nil, fmt.Errorf("file entry alloc descs offset out of bounds: %d", allocDescsOffset)
	}
	// Copy the allocation descriptors out of data so the returned entry does
	// not alias the caller's buffer. If the declared length runs past the end
	// of data, allocDescs is deliberately left nil rather than failing hard.
	var allocDescs []byte
	if allocDescsOffset+int(allocDescsLength) <= len(data) {
		allocDescs = make([]byte, allocDescsLength)
		copy(allocDescs, data[allocDescsOffset:allocDescsOffset+int(allocDescsLength)])
	}
	return &udfFileEntry{
		ICBTag:     fileType,
		InfoLength: infoLength,
		AllocDescs: allocDescs,
		AllocType:  byte(icbFlags & 0x07), // low 3 bits select the allocation descriptor type
	}, nil
}
// parseUDFDirectory parses raw directory data into a list of FIDs.
// Parsing stops at the first descriptor that is not a FID or that would
// run past the end of the buffer.
func parseUDFDirectory(dirData []byte) []udfFID {
	var fids []udfFID
	for offset := 0; offset+38 <= len(dirData); {
		if parseDescriptorTag(dirData[offset:]).TagID != udfTagFID {
			break
		}
		// FID fixed part: characteristics at +18, name length at +19,
		// ICB long_ad at +20, implementation-use length at +36.
		chars := dirData[offset+18]
		nameLen := int(dirData[offset+19])
		icb := parseLongAD(dirData[offset+20 : offset+36])
		implLen := int(binary.LittleEndian.Uint16(dirData[offset+36 : offset+38]))
		nameStart := offset + 38 + implLen
		if nameStart+nameLen > len(dirData) {
			break
		}
		parent := chars&0x08 != 0
		var name string
		if nameLen > 0 && !parent {
			name = decodeUDFString(dirData[nameStart : nameStart+nameLen])
		}
		fids = append(fids, udfFID{
			Name:        name,
			IsDir:       chars&0x02 != 0,
			IsParent:    parent,
			ICBLocation: icb,
		})
		// Advance by the FID's total length (38 + implLen + nameLen),
		// padded up to a 4-byte boundary.
		offset += (38 + implLen + nameLen + 3) &^ 3
	}
	return fids
}
// decodeUDFString decodes a UDF d-string/d-characters identifier. The first
// byte is an OSTA compression ID: 8 means one byte per character (Latin-1 /
// ASCII subset), 16 means big-endian 16-bit (UCS-2) characters. Any other ID
// is decoded as raw bytes as a best effort.
func decodeUDFString(data []byte) string {
	if len(data) == 0 {
		return ""
	}
	id, chars := data[0], data[1:]
	if id != 16 {
		// Compression ID 8 and unknown IDs alike: treat the payload as
		// raw 8-bit characters.
		return string(chars)
	}
	// 16-bit big-endian characters, two bytes each; a trailing odd byte
	// is ignored.
	var b strings.Builder
	for i := 0; i+1 < len(chars); i += 2 {
		b.WriteRune(rune(chars[i])<<8 | rune(chars[i+1]))
	}
	return b.String()
}
package source
import (
"bytes"
"encoding/binary"
)
// FindVideoStartCodes finds all video start code positions (00 00 01 XX pattern) in the data.
// Returns the position of the first 00 in each start code.
// These are potential sync points where video frames or other structures begin.
// Scanning is driven by bytes.IndexByte on the 0x01 byte, which uses SIMD on x86.
func FindVideoStartCodes(data []byte) []int {
	if len(data) < 4 {
		return nil
	}
	// Estimate roughly one start code per 2KB of video data.
	found := make([]int, 0, len(data)/2048+1)
	// Locate candidate 0x01 bytes and verify the two zero bytes before them.
	// The scan starts at index 2 because a start code needs 00 00 ahead of the 01.
	for cur := 2; cur < len(data)-1; {
		rel := bytes.IndexByte(data[cur:], 0x01)
		if rel < 0 {
			break
		}
		at := cur + rel
		if at >= 2 && data[at-2] == 0x00 && data[at-1] == 0x00 {
			found = append(found, at-2)
		}
		cur = at + 1
	}
	return found
}
// FindVideoStartCodesInRange finds video start codes within a specific range.
// Returns the position of the first 00 in each start code, offset by startOffset.
// Uses bytes.IndexByte for fast (SIMD-assisted) scanning.
func FindVideoStartCodesInRange(data []byte, startOffset int) []int {
	if len(data) < 4 {
		return nil
	}
	found := make([]int, 0, len(data)/2048+1) // ~1 start code per 2KB
	for cur := 2; cur < len(data)-1; {
		rel := bytes.IndexByte(data[cur:], 0x01)
		if rel < 0 {
			break
		}
		at := cur + rel
		if at >= 2 && data[at-2] == 0x00 && data[at-1] == 0x00 {
			found = append(found, startOffset+at-2)
		}
		cur = at + 1
	}
	return found
}
// FindVideoNALStarts finds NAL unit start positions in Annex B formatted data.
// Returns positions of NAL header bytes (the byte AFTER 00 00 01).
// This is used for hashing: NAL header + NAL data are identical in both
// Annex B (source) and AVCC (MKV) formats, enabling cross-format matching.
func FindVideoNALStarts(data []byte) []int {
	if len(data) < 4 {
		return nil
	}
	found := make([]int, 0, len(data)/2048+1)
	for cur := 2; cur < len(data)-1; {
		rel := bytes.IndexByte(data[cur:], 0x01)
		if rel < 0 {
			break
		}
		at := cur + rel
		// The start code occupies at-2..at; the NAL header byte follows at
		// at+1 and must actually exist within the buffer.
		if at >= 2 && data[at-2] == 0x00 && data[at-1] == 0x00 && at+1 < len(data) {
			found = append(found, at+1)
		}
		cur = at + 1
	}
	return found
}
// FindVideoNALStartsInRange finds NAL unit start positions in a specific range.
// Returns positions offset by startOffset for use during chunked file processing.
func FindVideoNALStartsInRange(data []byte, startOffset int) []int {
	if len(data) < 4 {
		return nil
	}
	found := make([]int, 0, len(data)/2048+1)
	for cur := 2; cur < len(data)-1; {
		rel := bytes.IndexByte(data[cur:], 0x01)
		if rel < 0 {
			break
		}
		at := cur + rel
		// NAL header byte sits one past the 0x01 and must be in bounds.
		if at >= 2 && data[at-2] == 0x00 && data[at-1] == 0x00 && at+1 < len(data) {
			found = append(found, startOffset+at+1)
		}
		cur = at + 1
	}
	return found
}
// FindAVCCNALStarts finds NAL unit start positions in AVCC/HVCC formatted data.
// In AVCC format, each NAL unit is prefixed with a big-endian length field of
// nalLengthSize bytes (1-4; typically 4 for H.264 AVCC and H.265 HVCC).
// Returns positions of NAL header bytes (the byte after each length prefix).
func FindAVCCNALStarts(data []byte, nalLengthSize int) []int {
	if nalLengthSize < 1 || nalLengthSize > 4 {
		return nil
	}
	if len(data) < nalLengthSize+1 {
		return nil
	}
	offsets := make([]int, 0, len(data)/2048+1)
	for pos := 0; pos+nalLengthSize < len(data); {
		// Assemble the big-endian length prefix one byte at a time; this
		// handles all prefix sizes 1-4 uniformly.
		var nalLen uint32
		for _, b := range data[pos : pos+nalLengthSize] {
			nalLen = nalLen<<8 | uint32(b)
		}
		nalStart := pos + nalLengthSize
		if nalLen == 0 || nalStart >= len(data) {
			break
		}
		offsets = append(offsets, nalStart)
		next := nalStart + int(nalLen)
		if next <= pos {
			break // int overflow guard on 32-bit platforms
		}
		pos = next
	}
	return offsets
}
// Package testdata provides helpers for locating integration test data.
//
// Test data (Big Buck Bunny DVD ISO and MKV) is not stored in the repository.
// See README.md in this directory for setup instructions.
package testdata
import (
"os"
"os/exec"
"path/filepath"
"runtime"
"testing"
)
// Paths contains the resolved paths to test data files.
// When Available is false the path fields are not guaranteed to point at
// existing files; callers should check Available (or use SkipIfNotAvailable)
// before using them.
type Paths struct {
	Root      string // Base test data directory
	ISODir    string // Directory containing ISO file
	ISOFile   string // Path to the ISO file (PAL or NTSC variant)
	MKVDir    string // Directory containing MKV file(s)
	MKVFile   string // Path to the main MKV file (first glob match)
	Available bool   // True if all required files exist
}
// Names used to locate the generated test data.
const (
	// DefaultISOName is the expected ISO filename.
	DefaultISOName = "bbb-pal.iso"
	// DefaultMKVPattern is the glob pattern for finding MKV files.
	DefaultMKVPattern = "*.mkv"
)
// Find locates the test data directory and checks for required files.
// Candidate locations are tried in order:
//  1. $MKVDUP_TESTDATA environment variable
//  2. testdata/generated/ (relative to the testdata package, created by generate-test-data.sh)
//  3. ~/.cache/mkvdup/testdata/
//  4. /tmp/mkvdup-testdata/
//
// Returns Paths with Available=false if test data is not found.
func Find() Paths {
	var candidates []string
	if env := os.Getenv("MKVDUP_TESTDATA"); env != "" {
		candidates = append(candidates, env)
	}
	// testdata/generated/ is the preferred location for reproducible test data.
	if local := findLocalTestdataDir(); local != "" {
		candidates = append(candidates, local)
	}
	if home, err := os.UserHomeDir(); err == nil {
		candidates = append(candidates, filepath.Join(home, ".cache", "mkvdup", "testdata"))
	}
	candidates = append(candidates, "/tmp/mkvdup-testdata")

	var p Paths
	for _, root := range candidates {
		p.Root = root
		if checkPaths(&p) {
			return p
		}
	}
	// Nothing found — return a fully zeroed result (Available=false).
	return Paths{}
}
// checkPaths fills in the paths and returns true if all required files exist.
func checkPaths(p *Paths) bool {
	p.ISODir = filepath.Join(p.Root, "bigbuckbunny")
	p.MKVDir = filepath.Join(p.Root, "bigbuckbunny-mkv")
	// Accept either the PAL (default) or the NTSC ISO variant.
	isoFound := false
	for _, name := range []string{DefaultISOName, "bbb-ntsc.iso"} {
		p.ISOFile = filepath.Join(p.ISODir, name)
		if _, err := os.Stat(p.ISOFile); err == nil {
			isoFound = true
			break
		}
	}
	if !isoFound {
		p.Available = false
		return false
	}
	// Use the first MKV file matching the pattern.
	matches, err := filepath.Glob(filepath.Join(p.MKVDir, DefaultMKVPattern))
	if err != nil || len(matches) == 0 {
		p.Available = false
		return false
	}
	p.MKVFile = matches[0]
	p.Available = true
	return true
}
// SkipIfNotAvailable calls t.Skip if test data is not available.
// Use this at the start of integration tests.
func SkipIfNotAvailable(t interface{ Skip(...interface{}) }) Paths {
	paths := Find()
	if paths.Available {
		return paths
	}
	t.Skip("Test data not available. See testdata/README.md for setup instructions.")
	return paths
}
// CreateBlurayData creates a Blu-ray directory structure by remuxing the MKV
// file to M2TS format using ffmpeg (stream copy, no re-encoding). The
// directory is created under tmpDir with the layout BDMV/STREAM/00001.m2ts
// that DetectType recognises as TypeBluray.
//
// The test is skipped if ffmpeg is not available.
func (p Paths) CreateBlurayData(t testing.TB, tmpDir string) string {
	t.Helper()
	if _, err := exec.LookPath("ffmpeg"); err != nil {
		t.Skip("ffmpeg not available, skipping Blu-ray test")
	}
	root := filepath.Join(tmpDir, "bluray")
	stream := filepath.Join(root, "BDMV", "STREAM")
	if err := os.MkdirAll(stream, 0755); err != nil {
		t.Fatalf("CreateBlurayData: mkdir: %v", err)
	}
	target := filepath.Join(stream, "00001.m2ts")
	args := []string{
		"-loglevel", "error",
		"-i", p.MKVFile,
		"-c", "copy",
		"-f", "mpegts",
		"-y", // overwrite if exists
		target,
	}
	out, err := exec.Command("ffmpeg", args...).CombinedOutput()
	if err != nil {
		t.Fatalf("CreateBlurayData: ffmpeg remux failed: %v\n%s", err, out)
	}
	return root
}
// findLocalTestdataDir returns the path to the testdata/generated/ directory
// that sits next to this source file, or "" if the caller's file path cannot
// be determined via runtime.Caller.
func findLocalTestdataDir() string {
	if _, file, _, ok := runtime.Caller(0); ok {
		// The generated data lives alongside this file (testdata.go).
		return filepath.Join(filepath.Dir(file), "generated")
	}
	return ""
}