Advertisement
Guest User

videocleaner.pl

a guest
Dec 18th, 2018
1,915
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 9.77 KB | None | 0 0
  1. #!/usr/bin/perl -w
  2. #
  3. # This parses mp4, wmv, mkv top level chunks, displays offset and length,
  4. # and if invalid data is added at the end, it can be removed with the -f
  5. # flag.
  6. #
  7. # To use this Perl script on Windows, install Strawberry Perl from
  8. # strawberryperl.com, then run it like this:
  9. #
  10. # perl D:\torrents\videocleaner.pl D:\torrents\clip.mp4
  11. #
  12. # If it shows anything invalid at the end of the file, running it again with
  13. # -f will remove those extra bytes.
  14. #
  15. # perl D:\torrents\videocleaner.pl -f D:\torrents\clip.mp4
  16. #
  17.  
  18. use File::Copy;
  19. use Fcntl qw(:flock SEEK_END);
  20.  
  # Script-wide settings.
  my $force = 0;           # set by -f: back up and remove the trailing bytes
  my $bytescutoff = 1000;  # passed to findtagbytes() as its safety cutoff

  my $usage = "Usage: $0 [-f] <media filename>\n" .
              "  -f splits invalid trailing bytes out into a separate file.\n";

  # Option handling.  @ARGV is checked first so that running the script with
  # no arguments prints the usage text instead of comparing against undef and
  # then trying to open an undefined filename.
  if (@ARGV && $ARGV[0] eq "-h") {
      print $usage;
      exit;
  } elsif (@ARGV && $ARGV[0] eq '-f') {
      $force = 1;
      shift @ARGV;
  }
  die $usage unless @ARGV;

  # Name of the media file to inspect; file-scoped so the code further down
  # the script can use it.
  my $cfn = $ARGV[0];

  # Keep a binary read handle open; it is used later to read the trailing
  # bytes for the backup.
  open(my $fh, "<", $cfn) or die "Unable to open $cfn: $!\n";
  binmode($fh);

  # Size of the media file in bytes at start-up.
  my $cfs = -s $fh;
  40.  
  # Truncate a file by removing bytes from its end.
  #
  # Arguments:
  #   $cfn        - name of the file to truncate
  #   $cfs        - current size of the file in bytes
  #   $truncbytes - number of bytes to remove from the end
  #
  # Dies if $truncbytes is negative or larger than $cfs, or if the file cannot
  # be stat'ed, made writable, or truncated.  Returns 1 on success.
  sub binTrunc {
      my ($cfn, $cfs, $truncbytes) = @_;

      # A count outside 0..$cfs would produce a negative or growing length.
      die "Invalid truncate size ($truncbytes) for file of $cfs bytes\n"
          if $truncbytes < 0 || $truncbytes > $cfs;

      my $newbytes = $cfs - $truncbytes;
      print "truncating $cfn to $newbytes\n";

      # make sure the owner can read and write the file
      my @st = stat($cfn) or die "Unable to stat file, $!\n";
      my $perm = $st[2] & 07777;
      chmod($perm | 0600, $cfn) or die "Unable to make file writable, $!\n";

      # truncate
      truncate($cfn, $newbytes) or die "Unable to truncate file, $!\n";
      return 1;
  }
  55.  
  56.  
  # Save trailing data to a backup file without overwriting an existing one.
  #
  # Arguments:
  #   $tagfile - preferred name of the backup file
  #   $c4stag  - the bytes to store
  #
  # If $tagfile already exists, "$tagfile.1", "$tagfile.2", ... are tried
  # until an unused name is found.  Dies if the backup cannot be created,
  # written, or closed, so the caller never goes on to truncate the media file
  # after a failed backup.  Returns 1 on success.
  sub backupTag {
      my ($tagfile, $c4stag) = @_;

      my $tagfn = $tagfile;
      my $eidx = 1;

      # find a name that doesn't exist
      while ( -e $tagfn ) {
          $tagfn = "$tagfile.$eidx";
          $eidx++;
      }

      open(my $fho, ">", $tagfn)
          or die "Unable to create backup file $tagfn: $!\n";
      binmode($fho);

      # Buffered write errors (e.g. disk full) may only show up at close, so
      # both calls are checked.
      print {$fho} $c4stag
          or die "Unable to write backup file $tagfn: $!\n";
      close($fho)
          or die "Unable to finish writing backup file $tagfn: $!\n";

      print "Backed up trailing data to ($tagfn)\n";
      return 1;
  }
  76.  
  # Identify the container format from the first bytes of a file.
  #
  # Argument:
  #   $header - the leading bytes of the file (at least 8 bytes are needed for
  #             the mp4 checks, 4 for the others)
  #
  # Returns "mp4", "wmv", "mkv", or "unknown".
  #
  # Examples of recognised headers:
  #   ASF       "\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c"
  #   MP4       "\x00\x00\x00.ftypisom"
  #   Matroska  1a 45 df a3 93 42 82 88 6d 61 74 72 6f 73 6b 61
  sub filetype {
      my ($header) = @_;

      # Nothing to inspect (e.g. an empty file).
      return "unknown" unless defined $header && length $header;

      # The "." placeholders stand for arbitrary size bytes.  /s is required
      # because without it "." does not match the byte 0x0a, so a box whose
      # size field happens to contain 0x0a would not be recognised.
      if ($header =~ /\A\x00\x00\x00.ftyp(?:isom|iso2|qt  |mp4[12]|M4V )/s
          || $header =~ /\A\x00...moov/s) {
          return "mp4";
      }

      my $magic = substr($header, 0, 4);
      return "wmv" if $magic eq "\x30\x26\xb2\x75";
      return "mkv" if $magic eq "\x1a\x45\xdf\xa3";
      return "unknown";
  }
  105.  
  106.  
  # Look up how many bytes make up the smallest chunk header for a file type.
  #
  # Argument:
  #   $ftype - "mp4", "wmv" or "mkv"
  #
  # Returns the byte count for that type; dies for any other type.
  sub chunk_min {
      my ($ftype) = @_;

      my %header_bytes_for = (
          mp4 => 8,
          wmv => 24,
          mkv => 16,
      );

      exists $header_bytes_for{$ftype}
          or die "Unknown file header type\n";

      return $header_bytes_for{$ftype};
  }
  122.  
  # Decode the type and total length of one top-level chunk and print them.
  #
  # Arguments:
  #   $ftype     - "mp4", "wmv" or "mkv"
  #   $contfh    - read handle on the media file, positioned just after
  #                $chunkdata; only read from for an mp4 64-bit size
  #   $pos       - file offset of the chunk (used for the printed report)
  #   $chunkdata - the chunk header bytes already read from the file
  #
  # Returns the list ($chunktype, $chunklength, $chunkextra):
  #   $chunktype   - guid as hex (wmv), 4-character name (mp4), element id as
  #                  hex with the length-marker bits stripped (mkv), or "----"
  #                  for a corrupt mkv header
  #   $chunklength - total chunk length in bytes including its header; 0 when
  #                  the header is corrupt
  #   $chunkextra  - extra header bytes read from $contfh (8 for an mp4
  #                  64-bit size, otherwise 0)
  #
  # Dies on an unknown file type or a read error.
  sub decode_chunk_type_length {
      my ($ftype, $contfh, $pos, $chunkdata) = @_;

      my $prettypos = sprintf("%10u", $pos);
      my ($chunklength, $chunktype);
      my $chunkextra = 0;

      if ($ftype eq "wmv") {
          # ASF uses little-endian for number storage
          ($chunktype, $chunklength) = unpack('(H32Q)<', $chunkdata);
      } elsif ($ftype eq "mp4") {
          # ISO mpeg uses big-endian for number storage
          ($chunklength, $chunktype) = unpack('(LA4)>', $chunkdata);
          if ($chunklength == 1) {
              # 32 bit size 1 in mp4 means 64 bit size after the chunk type
              my $chunk64size;
              my $got = read($contfh, $chunk64size, 8);
              die "Unable to read 64-bit chunk size: $!\n" unless defined $got;
              if ($got == 8) {
                  ($chunklength) = unpack('(Q)>', $chunk64size);
                  $chunkextra += 8;
              } else {
                  # The file ends inside the size field.  Report length 0 so
                  # the caller treats it like any other invalid chunk instead
                  # of working with an undefined length.
                  print "corrupt mp4 chunk (truncated 64-bit size)\n";
                  $chunklength = 0;
              }
          }
      } elsif ($ftype eq "mkv") {
          # -1 marks "not decoded" so the validity test below never compares
          # an undefined value.
          my $vint2len = -1;

          # unpack mkv chunk id
          my ($vint1len, $chunkid) = ebml_size_unpack($chunkdata);
          if ($vint1len > 0) {
              # unpack mkv chunk size from everything after the id
              ($vint2len, $chunklength) =
                  ebml_size_unpack(substr($chunkdata, $vint1len));
          }
          if ($vint1len < 1 || $vint2len < 1) {
              print "corrupt mkv block (invalid block size)\n";
              $chunklength = 0;
              $chunktype = "----";
          } else {
              # stored size excludes the id and size fields themselves
              $chunklength += $vint2len + $vint1len;
              $chunktype = sprintf("%x", $chunkid);
          }
      } else {
          die "Unknown file type\n";
      }

      printf("offset %11i length %11i  (type %s)\n", $prettypos, $chunklength, $chunktype);
      return ($chunktype, $chunklength, $chunkextra);
  }
  167.  
  168.  
  # Decide whether a decoded chunk length is plausible.
  #
  # Arguments:
  #   $ftype          - file type; 'mkv' gets its own lower bound
  #   $chunkmin       - smallest acceptable length for non-mkv types
  #   $chunklength    - decoded chunk length
  #   $remainingbytes - bytes left in the file from the start of the chunk
  #
  # Returns 0 when the chunk would run past the end of the file, otherwise
  # the result of comparing the length against the lower bound.
  sub is_chunk_size_ok {
      my ($ftype, $chunkmin, $chunklength, $remainingbytes) = @_;

      # a chunk can never be longer than what is left of the file
      return 0 if $chunklength > $remainingbytes;

      # mkv lengths are 0 if there's a problem with the chunk, so any length
      # of at least 2 is accepted there
      my $lower_bound = ($ftype eq 'mkv') ? 2 : $chunkmin;

      return $chunklength >= $lower_bound;
  }
  183.  
  # Variable length integer unpacker for matroska (mkv) files only.
  #
  # Argument:
  #   $chunkdata - binary string starting with a variable length integer
  #
  # Returns the list ($vibytes, $value):
  #   $vibytes - number of bytes the integer occupies (1 to 8)
  #   $value   - decoded big-endian value with the length-marker bit removed
  # Returns (-1, -1) when the first byte has no marker bit set or when
  # $chunkdata is shorter than the encoded integer.
  sub ebml_size_unpack {
      my ($chunkdata) = @_;

      # encoded chunk size does not include elementid and chunk size
      # specs say big endian

      return (-1, -1) unless defined $chunkdata && length $chunkdata;

      my $rawbytei = ord(substr($chunkdata, 0, 1));

      # a zero first byte carries no length-marker bit
      return (-1, -1) if $rawbytei == 0;

      # The position of the highest set bit gives the byte count: 0x80 means
      # 1 byte, 0x40 means 2, ... 0x01 means 8.  Integer bit tests are used
      # so the result does not depend on floating-point log() rounding.
      my $vibytes = 1;
      my $marker = 0x80;
      until ($rawbytei & $marker) {
          $marker >>= 1;
          $vibytes++;
      }

      # not enough header bytes
      return (-1, -1) if $vibytes > length($chunkdata);

      # keep only the bits below the marker bit of the first byte
      my $accu = $rawbytei & ($marker - 1);

      for my $i (1 .. $vibytes - 1) {
          $accu = $accu * 256 + ord(substr($chunkdata, $i, 1));
      }
      return ($vibytes, $accu);
  }
  215.  
  216.  
  # Parses the file according to its file type, walking the top-level chunks
  # and looking for small broken chunks near the end.
  #
  # ISO media file top level structure:
  #   series of chunks, 4 byte size (BE), 4 byte chunk name, and data
  # ASF v1 media file top level structure:
  #   series of chunks, 16 byte guid, 8 byte chunk length (LE), and data
  #
  # Arguments:
  #   $contname    - name of the media file
  #   $bytescutoff - safety limit: an invalid chunk with more than this many
  #                  bytes left in the file is reported but not counted
  #
  # Returns the number of tag/junk bytes found at the end of the file:
  #   - the leftover byte count when fewer bytes remain than a chunk header
  #     needs,
  #   - the bytes from the start of the first invalid chunk to the end of the
  #     file when that is at most $bytescutoff,
  #   - 0 otherwise (clean file, or an invalid chunk too far from the end).
  #
  # Dies if the file cannot be opened or read, if its type is unknown, or if
  # the tracked position and the real file position disagree.
  sub findtagbytes {
      my ($contname, $bytescutoff) = @_;

      open(my $contfh, "<", $contname)
          or die "Unable to open $contname: $!\n";
      binmode($contfh);

      # detect file type
      my $header;
      defined(read($contfh, $header, 16))
          or die "Unable to read $contname: $!\n";
      my $ftype = filetype($header);
      my $chunkmin = chunk_min($ftype);

      my $contsize = -s $contfh;

      printf("File type %s, size     (%12i)\n", $ftype, $contsize);

      my $pos = 0;
      # reset file position
      seek($contfh, $pos, 0);  # 0 is SEEK_SET in fcntl module
      while ($pos < $contsize) {
          my $remainingbytes = $contsize - $pos;

          # failsafe position check, in case code is modified and file seeking
          # doesn't work correctly, avoid corrupting files
          my $testpos = tell($contfh);
          if ($testpos != $pos) {
              die "Mismatch between real pos $testpos and assumed pos $pos\n";
          }

          # If not enough room for a valid chunk, print warning and abort
          if ($remainingbytes < $chunkmin) {
              print "small junk section at end ($remainingbytes bytes)\n";
              close($contfh);
              return $remainingbytes;
          }

          # read the chunk header; at least $chunkmin bytes remain, so a
          # shorter read means the file changed or could not be read
          my $chunkdata;
          my $got = read($contfh, $chunkdata, $chunkmin);
          die "Unable to read $contname: $!\n" unless defined $got;
          die "Short read at offset $pos in $contname\n" unless $got == $chunkmin;

          # decode and print chunk header info
          my ($chunktype, $chunklength, $chunkextra) =
              decode_chunk_type_length($ftype, $contfh, $pos, $chunkdata);

          my $bytes_after = $remainingbytes - $chunklength;
          printf("                        (%12i) bytes remaining", $bytes_after);
          print "\n";

          # check for problems
          if (! is_chunk_size_ok($ftype, $chunkmin + $chunkextra, $chunklength, $remainingbytes)) {
              if ($remainingbytes > $bytescutoff) {
                  # if there are too many remaining bytes, do nothing, to avoid corruption
                  print "CAUTION: invalid chunk with >$bytescutoff ($remainingbytes) bytes left in file\n" .
                      "corruption probably unrelated to trailing junk bytes... not modifying\n";
                  # set to zero to avoid truncate
                  $remainingbytes = 0;
              } else {
                  print "  Junk detected at end of file\n";
              }

              close($contfh);
              return $remainingbytes;
          }

          # failsafe: a zero length would never advance $pos
          last if ($chunklength == 0);

          $pos += $chunklength;
          seek($contfh, $pos, 0); # 0 is SEEK_SET in module fcntl
      }
      close($contfh);
      return 0;
  }
  298.  
  299.  
  # This section looks for junk or broken top layer chunks
  # In addition to what some clip sites add
  # Some files uploaded to file locker sites have a few added null characters

  my $tagbytes = findtagbytes($cfn, $bytescutoff);

  print "\n";

  if ($tagbytes > 0) {
      # findtagbytes() found junk bytes at the end of the file not belonging
      # to a valid media file chunk, so remove them.
      #
      # Read the trailing bytes into a variable for backup purposes.  Every
      # step is checked: the backup must hold exactly the bytes that are about
      # to be cut off, otherwise the truncate would lose data.
      seek($fh, -$tagbytes, SEEK_END)
          or die "Unable to seek in $cfn: $!\n";
      my $c4stag;
      my $got = read($fh, $c4stag, $tagbytes);
      die "Unable to read trailing data from $cfn: $!\n" unless defined $got;
      die "Short read of trailing data from $cfn ($got of $tagbytes bytes)\n"
          unless $got == $tagbytes;

      # The read handle is no longer needed; release it before the file is
      # modified by name.
      close($fh);

      die "Run with '-f' flag to actually truncate the file." unless $force;

      # Back up the trailing data and remove it by truncating the media file
      backupTag($cfn . ".tag", $c4stag);
      binTrunc($cfn, $cfs, $tagbytes);
      print "cleaned $cfn\n";
  } else {
      print "No trailing junk data detected (limited to 1-$bytescutoff bytes)\n";
      close($fh);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement