Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/env perl
#
# This is a simple script that "greps" an XML file based on
# the names of its XML elements. For each selected element it
# prints the element name and the text data it contains.
#
# Author: Hector Rivas
#
- use XML::Parser;
- use Getopt::Std;
use strict;
use warnings;

# Forward declaration: print_help() is defined further down in the file but
# is called below, so the parser has to see its prototype first.
sub print_help();

# Command-line switches, filled in by getopts().
my %Options;

# Parser state shared with the handle_* callbacks. The callbacks unshift a
# new slot when an element opens and shift it when the element closes, so
# index 0 always belongs to the innermost element that is still open.
my @parent             = ();      # Names of the currently open elements
my @element_content    = ();      # Text collected so far for each open element
my @element_subcontent = ("");    # Output lines produced by closed children; the
                                  # initial slot ends up holding the whole output

# Initialize the parser
my $parser = XML::Parser->new(
    Handlers => {
        Start => \&handle_start,
        End   => \&handle_end,
        Char  => \&handle_char,
    },
);

# Parse the options; an unknown switch is a usage error.
if (not getopts('q1etp', \%Options)) {
    print_help();
    exit 1;
}

# Get the file name. Definedness/emptiness is tested instead of truth so
# that a file literally named "0" is still accepted.
my $filename = shift @ARGV;
if (not defined $filename or $filename eq '') {
    print_help();
    exit 0;
}

# Elements to query. Only the keys matter, so the remaining arguments are
# stored as hash keys with undefined values. An empty hash means "no filter".
my %element_filter;
@element_filter{@ARGV} = ();

$parser->parsefile($filename);

# Print the accumulated content
print $element_subcontent[0];
# Print the usage summary to standard output.
# Takes no arguments. Every switch accepted by the getopts() call of this
# script is listed, including -t.
sub print_help() {
    print <<HelpText;
Usage: grepxml [options] input.xml [Element1 ...]
This program greps an XML printing the node names and the Text data.
Options:
-1 Print the first node searched and exit.
-q Print the data, not the node names.
-e Print also elements with empty content
-t Indent each line with one space per enclosing element
-p Print all the parents for each element (root.sub1.sub2.element val)
HelpText
}
# Return a copy of the given string with leading and trailing whitespace
# removed. The caller's value is not modified.
sub trim($)
{
    my ($text) = @_;

    # "for" aliases $_ to the local copy so both substitutions act on it.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
# Start handler: runs when an element opens.
# Receives the parser object and the element name (attributes that follow
# are ignored). Opens a fresh, empty slot at the front of both content
# stacks and records the element name as the innermost open element.
sub handle_start {
    my ($parser, $name) = @_;

    unshift @element_subcontent, "";    # lines produced by this element's children
    unshift @element_content,    "";    # this element's own text
    unshift @parent, $name;
}
# Char handler: runs for each piece of text data.
# Receives the parser object and the text, and appends the text to the slot
# of the innermost open element (index 0 of @element_content).
sub handle_char {
    my ($parser, $text) = @_;

    $element_content[0] .= $text;
}
# End handler: runs when an element closes.
# Receives the parser object and the element name.
#
# Pops this element's slots off @parent, @element_content and
# @element_subcontent, builds the output line for the element (if it is
# selected and has something to show) and appends that line, followed by the
# lines already produced by its children, to the enclosing element's slot.
# With -1 the first selected element is printed immediately and the program
# exits with status 0.
sub handle_end {
    my ($expat, $element) = @_;

    # Dotted path from the outermost element down to this one. @parent keeps
    # the innermost element first, hence the reverse.
    my $element_path = join(".", reverse(@parent));
    shift @parent;

    # An element is explicitly selected when either its bare name or its
    # full dotted path was given on the command line.
    my $is_selected = (exists $element_filter{$element}
                       or exists $element_filter{$element_path}) ? 1 : 0;

    # Without any filter every element is printed.
    my $print_content = (keys(%element_filter) == 0 or $is_selected) ? 1 : 0;

    # Get the accumulated content
    my $content = trim(shift @element_content);

    # And the subelement content
    my $subcontent = shift @element_subcontent;

    # The new content
    my $new_content = "";

    # length() rather than plain truth, so that text such as "0" still
    # counts as content; -e additionally lets empty elements through.
    if ((length($content) or $Options{'e'}) and $print_content) {

        # -t: one space per element that is still open around this one.
        if ($Options{'t'}) {
            $new_content .= " " x scalar(@element_content);
        }

        # Unless -q, prefix the line with the name (or full path with -p).
        if (not $Options{'q'}) {
            $new_content .= ($Options{'p'} ? $element_path : $element) . " ";
        }

        $new_content .= "$content\n";

        # If option -1 is set and we are filtering for this element, exit.
        if ($Options{'1'} and $is_selected) {
            print $new_content;
            exit 0;
        }
    }

    # The element's own line goes before the lines of its children.
    $element_subcontent[0] .= $new_content . $subcontent;
}
Add Comment
Please, Sign In to add comment