#!/bin/env perl
#
# This is a simple script that "greps" an XML file based on
# the names of its XML elements. It prints the text content
# of each selected element.
#
# Author: Hector Rivas
#
use XML::Parser;
use Getopt::Std;

# State shared between the main program and the parser handlers.  The three
# arrays are used as stacks whose top is element 0.
my %Options;                    # Command line switches, filled in by getopts
my @parent = ();                # Names of the open elements, innermost first
my @element_content = ();       # Text collected so far for each open element
my @element_subcontent = ("");  # Output produced by the children of each open
                                # element; the last entry collects the output
                                # of the whole document

# Initialize the parser
my $parser = XML::Parser->new( Handlers => {
                                Start => \&handle_start,
                                End   => \&handle_end,
                                Char  => \&handle_char,
                                });

# Parse the options; an unknown switch is a usage error
if (not getopts('q1etp', \%Options)) {
    print_help();
    exit 1;
}

# The first remaining argument is the file name.  Test for definedness and
# emptiness rather than truth, so that a file named "0" is accepted.
my $filename = shift @ARGV;
if (not defined $filename or $filename eq '') {
    print_help();
    exit 0;
}

# Elements to query.  The remaining arguments become the keys of a hash so
# the handlers can look them up with exists; the values are not used.
my %element_filter;
@element_filter{@ARGV} = ();

$parser->parsefile($filename);

# Print the accumulated content
print $element_subcontent[0];

# Print the usage message and the list of supported options to standard
# output.  Takes no arguments; the caller decides the exit status.
sub print_help() {
    print <<HelpText;
Usage: grepxml [options] input.xml [Element1 ...]

This program greps an XML file, printing the node names and the Text data.
Options:
    -1  Print the first node searched and exit.
    -q  Print the data, not the node names.
    -e  Print also elements with empty content
    -t  Indent each line with one space per nesting level of its element
    -p  Print all the parents for each element (root.sub1.sub2.element val)

HelpText
}

# Return a copy of the argument with the leading and the trailing
# whitespace removed.  Whitespace inside the string is left untouched.
sub trim($)
{
    my ($text) = @_;
    # One pass: strip the run anchored at the start, then the one at the end
    $text =~ s/\A\s+|\s+\z//g;
    return $text;
}

# Start handler: called by the parser each time an element is opened.
# Pushes a fresh, empty text buffer and a fresh, empty children-output
# buffer on top of their stacks, then records the element name as the
# innermost open element.  The attributes passed by the parser are ignored.
sub handle_start {
    my ($expat, $element) = @_;

    unshift @$_, "" for (\@element_content, \@element_subcontent);
    unshift @parent, $element;
}

# Char handler: called by the parser with a piece of character data.
# A single text node may be delivered in several pieces, so each piece is
# appended to the text buffer of the innermost open element.
sub handle_char {
    my ($expat, $text) = @_;

    $element_content[0] .= $text;
}


# End handler: called by the parser each time an element is closed.
#
# Pops the state that handle_start pushed for this element (its name, its
# text and the output produced by its children), builds the output line of
# the element when it has to be printed, and appends that line followed by
# the children's output to the buffer of the enclosing element.
#
# Parameters:
#   $expat   - the parser object handed over by XML::Parser (not used)
#   $element - the name of the element being closed
#
# With option -1, the line of the first element matching the filter is
# printed at once and the program exits with status 0.
sub handle_end {
    my ($expat, $element) = @_;

    # Dotted path from the root to this element (root.sub1.element).  The
    # stack keeps the innermost element first, hence the reverse.
    my $element_path = join(".", reverse(@parent));
    shift @parent;

    # True when the element was named on the command line, either by its
    # plain name or by its full dotted path.
    my $is_filtered = (exists $element_filter{$element}
                       or exists $element_filter{$element_path});

    # Without any filter every element is selected.  The variable is always
    # assigned: a "my" guarded by a statement modifier has undefined
    # behaviour when the condition is false.
    my $print_content = (keys(%element_filter) == 0 or $is_filtered);

    # Get the accumulated text of this element
    my $content = trim(shift @element_content);
    # And the output already produced by its children
    my $subcontent = shift @element_subcontent;

    # Test the length and not the truth of the text, so that an element
    # whose text is "0" is not taken for an empty one.
    my $has_content = (defined $content and length $content);

    # The line for this element; stays empty when nothing has to be printed
    my $new_content = "";

    # Print when the text is not empty (or -e asks for empty elements too)
    # and the element is selected
    if (($has_content or $Options{'e'}) and $print_content) {
        if ($Options{'t'}) {
            # One space for each element that is still open
            $new_content .= " " x ($#element_content + 1);
        }
        if (not $Options{'q'}) {
            $new_content .= $Options{'p'} ? "$element_path " : "$element ";
        }
        $new_content .= "$content\n";

        # If option -1 is set and we are filtering for this element, exit
        if ($Options{'1'} and $is_filtered) {
            print $new_content;
            exit 0;
        }
    }

    # Children are closed before their parent, so the parent's own line is
    # put in front of the output accumulated for them.
    $element_subcontent[0] .= $new_content . $subcontent;
}