Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl -w
- #
- # This parses mp4, wmv, mkv top level chunks, displays offset and length,
- # and if invalid data is added at the end, it can be removed with the -f
- # flag.
- #
- # To use this perl script on windows, strawberryperl from
- # strawberryperl.com
- #
- # perl D:\torrents\videocleaner.pl D:\torrents\clip.mp4
- #
- # If it shows anything invalid at the end of the file, running it again with
- # -f will remove those extra bytes.
- #
- # perl D:\torrents\videocleaner.pl -f D:\torrents\clip.mp4
- #
- use File::Copy;
- use Fcntl qw(:flock SEEK_END);
- my $force = 0;
- my $bytescutoff = 1000;
- # Help
- if ($ARGV[0] eq "-h") {
- print "Usage: $0 [-f] <media filename>\n";
- print " -f splits invalid trailing bytes out into a separate file.\n";
- exit;
- } elsif ($ARGV[0] eq '-f') {
- $force = 1;
- shift @ARGV;
- }
- $cfn = $ARGV[0];
- open(my $fh, "<", $cfn) or die $!;
- binmode($fh);
- $cfs = -s $fh;
- # truncate filename (arg1) of a given size (arg2) by (arg3) bytes.
- # returns nothing
- sub binTrunc {
- my ($cfn, $cfs, $truncbytes) = @_;
- my $newbytes = $cfs - $truncbytes;
- print "truncating $cfn to $newbytes\n";
- # make sure file is readable
- my $perm = (stat $cfn)[2] & 07777;
- chmod($perm | 0600, $cfn) or die "Unable to make file writable, $!\n";
- # truncate
- truncate($cfn, $cfs-$truncbytes) or die "Unable to truncate file, $!\n";
- }
- sub backupTag {
- my ($tagfile, $c4stag) = @_;
- my $tagfn = $tagfile;
- my $eidx = 1;
- # find a name that doesn't exist
- while ( -e $tagfn ) {
- $tagfn = "$tagfile.$eidx";
- $eidx++;
- }
- open(my $fho, ">", $tagfn) or die $!;
- binmode($fho);
- print $fho $c4stag;
- close($fho);
- print "Backed up trailing data to ($tagfn)\n";
- }
- # identify the file type given the first few bytes from the file
- # (needs to be passed at least 8 bytes)
- # returns "mp4" or "wmv" or "mkv"
- sub filetype {
- my ($header) = @_;
- my $ftype;
- # examples
- # ASF "\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c"
- # MP4 "\x00\x00\x00.ftypisom"
- # Matroska 1a 45 df a3 93 42 82 88 6d 61 74 72 6f 73 6b 61
- # = 00011010b 45 df a3 10010011b (19)...
- #
- # 00000001b 42 85 81 01 18 53 80 (id)
- # = 67 01 00 = 01100111b 00000001b = 9985
- if (($header =~ /\A\x00\x00\x00.ftyp(isom|iso2|qt |mp4[12]|M4V )/) ||
- ($header =~ /\A\x00...moov/)) {
- $ftype = "mp4";
- } elsif (substr($header, 0, 4) eq "\x30\x26\xb2\x75") {
- $ftype = "wmv";
- } elsif (substr($header, 0, 4) eq "\x1a\x45\xdf\xa3") {
- $ftype = "mkv";
- } else {
- $ftype = "unknown";
- }
- return $ftype;
- }
- # takes file type as argument
- sub chunk_min {
- my ($ftype) = @_;
- my $chunkmin;
- if ($ftype eq "mp4") {
- $chunkmin = 8;
- } elsif ($ftype eq "wmv") {
- $chunkmin = 24;
- } elsif ($ftype eq "mkv") {
- $chunkmin = 16;
- } else {
- die "Unknown file header type\n";
- }
- }
- # takes file type and chunkdata as arguments
- sub decode_chunk_type_length {
- my ($ftype, $contfh, $pos, $chunkdata) = @_;
- my $prettypos = sprintf("%10u", $pos);
- my ($chunklength, $chunktype);
- my $chunkextra = 0;
- if ($ftype eq "wmv") {
- # ASF uses little-endian for number storage
- ($chunktype, $chunklength) = unpack('(H32Q)<', $chunkdata);
- } elsif ($ftype eq "mp4") {
- # ISO mpeg uses big-endian for number storage
- ($chunklength, $chunktype) = unpack('(LA4)>', $chunkdata);
- if ($chunklength == 1) {
- # 32 bit size 1 in mp4 means 64 bit size after the chunk type
- my $chunk64size;
- read($contfh, $chunk64size, 8) or die $!;
- ($chunklength) = unpack('(Q)>', $chunk64size);
- $chunkextra += 8;
- }
- } elsif ($ftype eq "mkv") {
- my ($vint1len, $vint2len);
- # unpack mkv chunk id
- ($vint1len, $chunktype) = &ebml_size_unpack($chunkdata);
- if ($vint1len > 0) {
- # unpack mkv chunk size
- ($vint2len, $chunklength) = &ebml_size_unpack(substr($chunkdata, $vint1len, -1));
- }
- if ($vint1len < 1 || $vint2len < 1) {
- print "corrupt mkv block (invalid block size)\n";
- $chunklength = 0;
- $chunktype = "----";
- } else {
- $chunklength += $vint2len + $vint1len;
- }
- $chunktype = sprintf("%x", $chunktype);
- } else {
- die "Unknown file type\n";
- }
- printf("offset %11i length %11i (type %s)\n", $prettypos, $chunklength, $chunktype);
- return ($chunktype, $chunklength, $chunkextra);
- }
- sub is_chunk_size_ok {
- my ($ftype, $chunkmin, $chunklength, $remainingbytes) = @_;
- if ($chunklength > $remainingbytes) {
- return 0;
- }
- if ($ftype eq 'mkv') {
- # is 0 if there's a problem with the chunk
- return $chunklength >= 2;
- } else {
- return $chunklength >= $chunkmin;
- }
- }
- # variable length integer unpacker for matroska (mkv) files only.
- # takes binary data/string with a mkv chunk type and variable length integer
- # returns a list (chunk type, chunk size)
- sub ebml_size_unpack {
- my ($chunkdata) = @_;
- # encoded chunk size does not include elementid and chunk size
- # specs say big endian
- my $rawbytei = ord(substr($chunkdata, 0, 1));
- my $vibytes;
- if ($rawbytei == 0) {
- $vibytes = -1;
- } else {
- $vibytes = 8 - int(log($rawbytei)/log(2));
- }
- # check for missing set bit, or not enough header bytes
- return (-1, -1) if ( ($vibytes == -1) || ($vibytes > length($chunkdata)) );
- my $accu = (0xff >> $vibytes) & $rawbytei;
- my $i = 1;
- while ($i < $vibytes) {
- $accu = $accu * 256 + ord(substr($chunkdata, $i, 1));
- $i++;
- }
- #print "returning $vibytes, $accu\n";
- return ($vibytes, $accu);
- }
- # Parses the file according to its
- # file type, looking for small broken chunks near the end.
- #
- # Returns the number of tag/junk bytes found at the end of the file, up to the number
- # set in the second parameter for safety, so it doesn't truncate half of a file
- sub findtagbytes {
- # ISO media file top level structure
- # Series of chunks, 4 byte size (BE), 4 byte chunk name, and data
- # ASF v1 media file top level structure
- # Series of chunks, 16 byte guid, 8 byte chunk length (LE), and data
- my ($contname, $bytescutoff) = @_;
- open(my $contfh, "<", $contname) or die $!;
- binmode($contfh);
- # detect file type
- read($contfh, my $header, 16);
- my $ftype = &filetype($header);
- my $chunkmin = &chunk_min($ftype);
- my $contsize = -s $contfh;
- printf("File type %s, size (%12i)\n", $ftype, $contsize);
- my $pos = 0;
- # reset file position
- seek($contfh, $pos, 0); # 0 is SEEK_SET in fcntl module
- while ($pos < $contsize) {
- my $remainingbytes = $contsize - $pos;
- # failsafe position check, in case code is modified and file seeking doesn't
- # work correctly, avoid corrupting files
- my $testpos = tell($contfh);
- if ($testpos != $pos) {
- die "Mismatch between real pos $testpos and assumed pos $pos\n";
- }
- # If not enough room for a valid chunk, print warning and abort
- if ($remainingbytes < $chunkmin) {
- print "small junk section at end ($remainingbytes bytes)\n";
- close($contfh);
- return $remainingbytes;
- }
- # read the chunk header
- my $chunkdata;
- read($contfh, $chunkdata, $chunkmin) or die $!;
- # decode and print chunk header info
- my ($chunktype, $chunklength, $chunkextra) = &decode_chunk_type_length($ftype, $contfh, $pos, $chunkdata);
- $bytes_after = $remainingbytes - $chunklength;
- printf(" (%12i) bytes remaining", $bytes_after);
- print "\n";
- # check for problems
- if (! &is_chunk_size_ok($ftype, $chunkmin + $chunkextra, $chunklength, $remainingbytes)) {
- if ($remainingbytes > $bytescutoff) {
- # if there are too many remaining bytes, do nothing, to avoid corruption
- print "CAUTION: invalid chunk with >$bytescutoff ($remainingbytes) bytes left in file\n" .
- "corruption probably unrelated to trailing junk bytes... not modifying\n";
- # set to zero to avoid truncate
- $remainingbytes = 0;
- } else {
- print " Junk detected at end of file\n";
- }
- close($contfh);
- return $remainingbytes;
- }
- last if ($chunklength == 0);
- $pos += $chunklength;
- seek($contfh, $pos, 0); # 0 is SEEK_SET in module fcntl
- }
- close($contfh);
- return 0;
- }
- # This section looks for junk or broken top layer chunks
- # In addition to what some clip sites add
- # Some files uploaded to file locker sites have a few added null characters
- my $tagbytes = findtagbytes($cfn, $bytescutoff);
- print "\n";
- if ($tagbytes > 0) {
- # findtagbytes() found between 1 and $bytescutoff junk bytes at the end of the file
- # not belonging to a valid media file chunk, so remove it
- #
- # Read the trailing bytes into a variable for backup purposes.
- seek($fh, -$tagbytes, SEEK_END);
- read($fh, $c4stag, $tagbytes);
- die "Run with '-f' flag to actually truncate the file." unless $force;
- # Back up the trailing data and remove it by truncating the media file
- &backupTag($cfn . ".tag", $c4stag);
- &binTrunc($cfn, $cfs, $tagbytes);
- print "cleaned $cfn\n";
- } else {
- print "No trailing junk data detected (limited to 1-$bytescutoff bytes)\n";
- }
- close($fh);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement