Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- #
- # wyd.pl by Max Moser and Martin J. Muench
- #
- # [ Licence ]
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- #
- # See 'docs/gpl.txt' for more information.
- use strict;
- use FileHandle;
- use File::Find;
- use File::Basename;
- use Getopt::Long;
# Program version shown in the start-up banner
my $version = "0.2";

# Every regular file that will be handed to a word-list module
my @listoffiles;

# Location of the 'file' program; assigned during module initialisation
my $fileprog;

# Maps each word-list module to the file extensions it handles.
# Several extensions for one module are joined with ';'.
my %wlgmods = (
    'wlgmod::strings' => '',        # only used with command-line switch
    'wlgmod::plain'   => '.txt',    # used for all MIME text/plain as well
    'wlgmod::html'    => '.html;.htm;.php;.php3;.php4',
    'wlgmod::doc'     => '.doc',
    'wlgmod::pdf'     => '.pdf',
    'wlgmod::mp3'     => '.mp3',
    'wlgmod::ppt'     => '.ppt',
    'wlgmod::jpeg'    => '.jpeg;.jpg;.JPG;.JPEG',
    'wlgmod::odt'     => '.odt;.ods;.odp',
);

# One output filehandle per module type; only populated when -t is used
my %file_handle;
#### Begin main ####

# The banner is written to STDERR
print STDERR "\n*\n* $0 $version by Max Moser and Martin J. Muench\n*\n\n";

# Settings filled in from the command line; all start out undefined
my %opts;
my $strings_check;     # -s <min-len>: minimum length passed to the strings module
my $output_file;       # -o <file>:    write the word list to this file
my $separate_types;    # -t:           one output file per module type
my $no_filenames;      # -f:           do not add file names to the word list
my $debug;             # -v:           debug / verbose output
my $prefixclean;       # -b:           keep non-word chars at the start of a word
my $postfixclean;      # -e:           keep non-word chars at the end of a word
my $no_missingask;     # -n:           do not prompt when modules are missing

# Any option that cannot be parsed ends the run with the usage text
GetOptions(
    's=i' => \$strings_check,
    'o=s' => \$output_file,
    'v+'  => \$debug,
    'e+'  => \$postfixclean,
    'b+'  => \$prefixclean,
    't+'  => \$separate_types,
    'f+'  => \$no_filenames,
    'n+'  => \$no_missingask,
) or usage();

# -t needs the base name supplied by -o
usage() if ($separate_types && !$output_file);

# At least one file or directory has to be named
usage() unless @ARGV;
# Add given file(s)/directories to the list of files to process.
# Every command-line argument must name an existing regular file or a
# directory; anything else aborts the run.
foreach my $path (@ARGV) {
    # File/Dir does not exist
    if (!-e $path) {
        # Name the argument that is actually missing (not always the first one)
        die "\nError, $path does not exist.\n\n";
    }
    # Directory given
    elsif (-d $path) {
        # Its a directory so we first generate a list of all files with names
        print "\n Its a directory \n" if $debug;
        # The path is used exactly as given: it already passed the -d test,
        # so there is no need to substitute '.' for a "false" name like "0".
        find(sub { push @listoffiles, $File::Find::name if -f $_ }, $path);
    }
    # Single File
    elsif (-f $path) {
        push @listoffiles, $path;
    }
    else {
        die "\n* Error: $path is not a directory and not a regular file.\n* Sorry, for now this is unsupported.\n\n";
    }
}
print "\n\nThats the list to process: @listoffiles\n\n" if $debug;
# Initialize modules
if (!check_n_init()) {
    die "\n* Processing aborted\n\n";
}

# Open outputfile if requested.
# Three-argument open keeps the file name from being parsed as part of the
# open mode. The bareword handle OUTPUT is kept because the rest of the
# script prints to and closes it by that name.
if ($output_file && !$separate_types) {
    open(OUTPUT, '>', $output_file)
        or die "\n* Cannot open output file: $!\n";
}
# Create output files for all types if requested: one "<output>.<type>"
# file per module that is still registered in %wlgmods.
elsif ($output_file && $separate_types) {
    foreach my $module (keys %wlgmods) {
        (my $type = $module) =~ s/\Awlgmod:://;
        my $fh = FileHandle->new("$output_file.$type", "w")
            or die "\n* Cannot create $output_file.$type: $!\n";
        $file_handle{$type} = $fh;
    }
}
# We progress now with processing the files and produce the output
foreach my $singlefile (@listoffiles) {
    # Get words using modules
    my ($type, @words) = get_words($singlefile);

    # A file that was ignored can come back as undefined entries; drop those
    # and empty strings so they do not become blank lines in the word list.
    @words = grep { defined($_) && length($_) } @words;
    next unless @words;

    # Pick the destination once per file: the single output file, the
    # per-type file for this file's module type, or STDOUT.
    my $out;
    if ($output_file && !$separate_types) {
        $out = \*OUTPUT;
    }
    elsif ($output_file && $separate_types) {
        # No handle exists for an unknown type; such words are not written
        $out = $file_handle{$type} if defined $type;
    }
    else {
        $out = \*STDOUT;
    }

    print "---- Words in $singlefile -----\n\n" if $debug;
    if ($out) {
        foreach my $wort (unique(\@words)) {
            print {$out} "$wort\n";
        }
    }
    print "\n----- $singlefile -----\n" if $debug;
}
# Close the output handle(s). Buffered write errors (e.g. a full disk) are
# only reported when the handle is closed, so each close is checked and a
# failure is reflected in the exit status.
my $exit_status = 0;

# single out
if ($output_file && !$separate_types) {
    if (!close(OUTPUT)) {
        warn "\n* Error writing $output_file: $!\n";
        $exit_status = 1;
    }
}
# single file for each type
elsif ($output_file && $separate_types) {
    foreach my $type (keys %file_handle) {
        my $file = "$output_file.$type";
        if (!close($file_handle{$type})) {
            warn "\n* Error writing $file: $!\n";
            $exit_status = 1;
        }
        # remove empty files
        unlink $file if -z $file;
    }
}
print STDERR "\n** Done\n\n";
exit($exit_status);
#### End of main ####
# Load needed plugin and extract words
#
# Arguments:
#   $file - path of the file to extract words from
#
# Returns:
#   ($type, @words), where $type is the name of the module that handled the
#   file with its 'wlgmod::' prefix removed and @words are the cleaned,
#   non-empty words. Returns an empty list when the file is ignored because
#   no module can handle it.
#
# Module selection:
#   1. a module from %wlgmods whose extension list contains the file's
#      extension (compared case-insensitively),
#   2. otherwise wlgmod::plain if the 'file' program reports text/plain,
#   3. otherwise wlgmod::strings if -s was given and the module is available.
#
# Reads the file-level settings %wlgmods, $fileprog, $strings_check,
# $no_filenames, $prefixclean, $postfixclean and $debug.
sub get_words {
    my ($file) = @_;
    my @words;
    my $type;

    # Only the last dot-suffix counts as the extension, so "a.b.pdf" is
    # still recognised as ".pdf" and the word "a.b" is kept as its name.
    my ($file_name, undef, $file_ext) = fileparse($file, '\.[^.]*');

    # Look for matching module and get words
    MODULE:
    foreach my $module (keys %wlgmods) {
        foreach my $extension (split /;/, $wlgmods{$module}) {
            next unless lc($file_ext) eq lc($extension);
            $type  = $module;
            @words = $module->get_words($file);
            last MODULE;
        }
    }

    # If no module is found, do further checks
    if (!defined $type) {
        # Check MIME type, if ascii try plain-text module. The check is
        # skipped when 'file' was not located. List-form pipe open bypasses
        # the shell, so quotes or metacharacters in $file are harmless.
        my $mime = '';
        if (defined $fileprog) {
            open(my $pipe, '-|', $fileprog, '-bi', '--', $file)
                or die "Cannot execute file: $!\n";
            my $answer = <$pipe>;
            close($pipe);
            $mime = $answer if defined $answer;
        }

        if ($mime =~ m{\Atext/plain} && exists $wlgmods{'wlgmod::plain'}) {
            print "'file' MIME check returned text/plain\n" if $debug;
            $type  = "wlgmod::plain";
            @words = wlgmod::plain->get_words($file);
        }
        # Use strings module, if requested and still available
        elsif ($strings_check && exists $wlgmods{'wlgmod::strings'}) {
            $type  = "wlgmod::strings";
            @words = wlgmod::strings->get_words($file, $strings_check);
        }
        # Give up and ignore file
        else {
            print STDERR "Ignoring file '$file'\n";
            return;
        }
    }

    # Add filename itself to wordlist (without path/extension)
    push @words, $file_name unless $no_filenames;

    # Remove brackets quotes etc. from both ends of every word
    my @cleaned_words;
    foreach my $word (@words) {
        next unless defined $word;
        $word =~ s/\A\W+// unless $prefixclean;
        # Strip the whole run of trailing non-word characters, not just one
        $word =~ s/\W+\z// unless $postfixclean;
        next unless length $word;
        push @cleaned_words, $word;
    }

    # Cleanup type for high-level func
    $type =~ s/\Awlgmod:://;
    return ($type, @cleaned_words);
} # End sub getwords
# Check modules for availability and init or remove them
#
# Locates the 'file' program (stored in $fileprog, or undef when it cannot
# be found), loads every module listed in %wlgmods and calls its init()
# method. A module that cannot be loaded, or whose init() returns a true
# value (treated as an error text), is removed from %wlgmods. When anything
# failed, the problems are printed to STDERR and, unless -n was given, the
# user is asked to confirm before continuing.
#
# Returns: always 1.
sub check_n_init {
    my $retvals = '';

    # Check for 'file'
    $fileprog = undef;
    if (open(my $which, '-|', 'which', 'file')) {
        my $location = <$which>;
        # Closing a pipe waits for the command and sets $? to its status
        close($which);
        if ($? == 0 && defined $location) {
            chomp $location;
            $fileprog = $location if length $location;
        }
    }
    if (!defined $fileprog) {
        $retvals .= "file: Cannot locate 'file', skipping MIME type check on unknown files\n";
    }

    # Initialize possible modules
    foreach my $module (keys %wlgmods) {
        my $ret;
        (my $module_path = "$module.pm") =~ s{::}{/}g;

        if (eval { require $module_path; $module->import; 1 }) {
            $ret = $module->init();
        }
        else {
            # A module that cannot be loaded is reported and disabled like
            # one whose init() failed, instead of aborting on ->init().
            my ($reason) = split /\n/, ($@ || 'unknown error');
            $ret = "cannot load module ($reason)";
        }

        # If module failed, add errortext and remove from hash
        if ($ret) {
            $retvals .= "$module: $ret\n";
            delete $wlgmods{$module};
        }
    }

    # If one or more modules failed, let user decide what to do
    if ($retvals) {
        print STDERR "\n* Error initializing some modules:\n\n$retvals\n";
        # prompt user what to do if not disabled
        if (!$no_missingask) {
            print STDERR "* Press enter to disable them and continue or STRG+C to abort\n";
            <STDIN>;
        }
    }
    return 1;
}
# Make resulting list entries unique
#
# Takes a reference to a list and returns its entries with duplicates
# removed; the first occurrence of each entry is kept, in original order.
sub unique {
    my ($items) = @_;
    my %already_seen;
    my @result;
    foreach my $entry (@$items) {
        next if $already_seen{$entry}++;
        push @result, $entry;
    }
    return @result;
}
# print usage and exit
#
# Writes the option summary to STDOUT and ends the program with status 1.
sub usage {
    print <<"END_USAGE";
Usage: $0 [OPTIONS] <file(s)|directory>
Options:
-o <file> = Write wordlist to <file>
-t = Separate wordlist files by type, e.g. '<file>.doc'
-s <min-len> = Use 'strings' for unsupported files
-b = Disable removal of non-alpha chars at beginning of word
-e = Disable removal of non-alpha chars at end of word
-f = Disable inclusion of filenames in wordlist
-v = Show debug / verbose output
-n = Continue even if programs / modules are missing
END_USAGE
    exit(1);
}
- #### EOF #####
Add Comment
Please, Sign In to add comment