# Untitled

#! /opt/local/bin/ruby1.9 -w
# -*- coding: utf-8 -*-

# get_tag_and_class.rb: extract tag and class names from an HTML document.

# Matches an opening tag that carries a quoted class attribute.
# Capture 1 is the tag name; capture 2 is the raw attribute value (which may
# hold several space-separated class names). A tag is only recognised when it
# sits entirely on one line, because input is scanned line by line.
TAG_WITH_CLASS = /<(\w+)[^<>]*class=['"]([^'"]+)['"][^<>]*>/

# One match: the 1-based line number, the tag name and the class attribute
# value. The member is called +class_name+ rather than +class+ so that it does
# not shadow Object#class on the struct instances.
Tuple = Struct.new(:line_number, :tag, :class_name)

# Collects every opening tag with a class attribute found in +io+.
#
# @param io [#each_line] source of lines (an IO such as STDIN, a StringIO,
#   or a String)
# @return [Array<Tuple>] matches in document order; empty when none are found
def extract_tags_and_classes(io)
  found = []
  io.each_line.with_index(1) do |line, line_number|
    # String#scan resumes exactly where the previous match ended, so a tag
    # that starts immediately after another one (e.g. "<a class='x'><b ...>")
    # is not skipped.
    line.scan(TAG_WITH_CLASS) do |tag, class_name|
      found.push(Tuple.new(line_number, tag, class_name))
    end
  end
  found
end

# Read the whole document from standard input first, then print one
# "LINE: tag.class" entry per match.
extract_tags_and_classes(STDIN).each do |entry|
  STDOUT.puts "#{entry.line_number}: #{entry.tag}.#{entry.class_name}"
end