Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import sys
- from collections import defaultdict
class Entity(object):
    """
    Search entity described by one selector unit such as ``tag#id.class1.class2``.

    An attribute left as ``None`` places no constraint on the DOM element:
    ``tag`` and ``identifier`` hold single values, ``classes`` holds a set.
    Houses ``matches_dom`` (the comparison) and ``get_entity`` (a factory that
    builds an entity from the parser's dictionary of collected tokens).
    """

    def __init__(self, tag=None, classes=None, identifier=None):
        """
        :param tag: tag name to match, or None for "any tag"
        :param classes: iterable of class names (or None); an empty iterable
                        is treated the same as None, i.e. no class constraint
        :param identifier: id to match, or None for "any id"
        """
        self.tag = tag
        self.classes = set(classes) if classes else None
        self.identifier = identifier

    def __repr__(self):
        classes = None if self.classes is None else sorted(self.classes)
        return "Entity(tag=%r, classes=%r, identifier=%r)" % (
            self.tag, classes, self.identifier)

    @staticmethod
    def matches_dom(entity, dom: dict):
        """
        Tell whether a DOM element satisfies every constraint of the entity.

        :param entity: the search entity containing tag, classes and identifier
        :param dom: the DOM element to compare with; must carry a 'tag' key and
                    may carry 'id' and 'classes'
        :return: True if the element matches the entity, False otherwise
        :raises AssertionError: if the DOM element has no 'tag'
        """
        assert dom.get('tag') is not None, "DOM element is missing its 'tag'"

        if entity.tag is not None and entity.tag != dom['tag']:
            return False
        if entity.identifier is not None and entity.identifier != dom.get('id'):
            return False
        if entity.classes is not None:
            # `or ()` covers both a missing key and an explicit null value,
            # which would otherwise make the membership test raise TypeError.
            dom_classes = dom.get('classes') or ()
            return all(name in dom_classes for name in entity.classes)
        return True

    @staticmethod
    def get_entity(dom_dict: dict):
        """
        Factory method that builds an entity from a dictionary of token lists.

        :param dom_dict: dict (or defaultdict) whose optional 'tag' and
                         'identifier' entries are lists of which only the first
                         item is used, and whose optional 'classes' entry is
                         the list of class names
        :return: entity object
        """
        tag_tokens = dom_dict.get('tag')
        id_tokens = dom_dict.get('identifier')
        class_tokens = dom_dict.get('classes')
        return Entity(
            tag=tag_tokens[0] if tag_tokens else None,
            classes=class_tokens if class_tokens else None,
            identifier=id_tokens[0] if id_tokens else None,
        )
class ParserException(Exception):
    """Raised when a selector string contains a character the parser rejects."""
class SelectorUnitParser(object):
    """
    Turns one selector unit into an Entity by walking a small state machine.

    Parsing starts in the 'tag' state. A '.' or '#' that is a legal transition
    for the current state closes the token being read and switches state; any
    other character must be alphanumeric or '-' and is added to the token.
    """

    # For each state, the delimiter characters it accepts and the state each leads to.
    state_machine = {
        'tag': {'transition': {'.': 'classes', '#': 'identifier'}},
        'identifier': {'transition': {'.': 'classes'}},
        'classes': {'transition': {'.': 'classes'}},
    }

    @classmethod
    def parse(cls, selector_unit: str):
        """
        Build the search entity for a single selector unit.

        :param selector_unit: a selector of the form `tag#id.class1.class2`
        :return: search entity object representation of the selector unit
        :raises ParserException: on a character that is neither a legal
                                 delimiter for the current state, alphanumeric
                                 nor '-'
        """
        tokens = defaultdict(list)
        state = 'tag'
        pending = []  # characters of the token currently being read

        for char in selector_unit:
            next_state = cls.state_machine[state]['transition'].get(char)
            if next_state is not None:
                # Delimiter: store the finished token (empty ones are skipped).
                if pending:
                    tokens[state].append(''.join(pending))
                    pending = []
                state = next_state
                continue
            if not (char.isalnum() or char == '-'):
                raise ParserException("Invalid character (%s) in selector str: %s" % (char, selector_unit))
            pending.append(char)

        # Flush whatever token was still open when the input ended.
        if pending:
            tokens[state].append(''.join(pending))

        return Entity.get_entity(tokens)
class Stack(list):
    """ A list with stack-flavoured helpers: push onto the end, peek at the end """

    def push(self, obj):
        """ Place obj on top of the stack (the end of the list). """
        self.append(obj)

    def peek(self):
        """ Return the top item without removing it; IndexError if the stack is empty. """
        top = self[-1]
        return top
def get_search_stack(selector: str):
    """
    Build the stack of search entities for a whitespace-separated selector.

    Units are pushed last-to-first, so the first unit of the selector ends up
    on top of the stack and the last unit sits at index 0.

    :param selector: search selector string
    :return: stack of all the entities that needs to be searched for in the hierarchy
    """
    search_stack = Stack()
    for unit in reversed(selector.split()):
        search_stack.push(SelectorUnitParser.parse(selector_unit=unit))
    return search_stack
def get_matches_in_hierarchy(search_stack, current_dom):
    """
    Count the DOM elements under (and including) current_dom that satisfy the
    whole selector held in search_stack.

    The tree is walked depth-first with an explicit work list rather than
    recursion, so very deep hierarchies cannot hit the interpreter's recursion
    limit. The search stack is only read, never popped or copied: each pending
    element carries the number of entities still left to match on its path.

    :param search_stack: the entities to match, top of the stack (last item) first
    :param current_dom: root DOM element of the (sub)tree to search; children
                        are read from its optional 'children' list
    :return: the number of elements matching the final entity whose ancestors
             matched the preceding entities in order; 0 for an empty stack
    """
    if not search_stack:
        # An empty selector has nothing to match against.
        return 0

    matches = 0
    pending = [(current_dom, len(search_stack))]
    while pending:
        dom, remaining = pending.pop()
        # The entity on top of a stack with `remaining` items left.
        if Entity.matches_dom(search_stack[remaining - 1], dom):
            if remaining == 1:
                # Final entity: count it, and keep looking for it further down.
                matches += 1
            else:
                # Intermediate entity: descendants continue with the next one.
                remaining -= 1
        for child in dom.get('children') or ():
            pending.append((child, remaining))
    return matches
def process(user_input):
    """
    Run every selector listed under 'tests' against the 'hierarchy' tree.

    :param user_input: JSON text holding a 'hierarchy' object and a 'tests' list
    :return: list with the match count for each test selector, in input order
    """
    payload = json.loads(user_input)
    assert(payload.get('hierarchy') is not None)
    assert(payload.get('tests') is not None)

    dom_root = payload['hierarchy']
    return [
        get_matches_in_hierarchy(get_search_stack(selector), dom_root)
        for selector in payload.get('tests')
    ]
def driver(main_file=None):
    """
    The main driver function, runs the selector parser and prints the result.

    :param main_file: If a file name is specified read it from the file instead of stdin
    :return: None; the list of match counts is printed as JSON without whitespaces
    """
    if main_file:
        # Read from the path given as the argument (not from a module global)
        # and make sure the handle is closed once the contents are read.
        with open(main_file, "r") as f:
            user_input = f.read()
    else:
        user_input = sys.stdin.read()

    output = process(user_input)
    print(json.dumps(output, separators=(',', ':')))
def test():
    """ Smoke checks for the selector parser and the search stack builder. """
    # A unit with tag and id only.
    parsed = SelectorUnitParser.parse('body#content')
    assert parsed.tag == 'body'
    assert parsed.identifier == 'content'

    # A unit with tag, id and four classes.
    parsed = SelectorUnitParser.parse('body#content.foo.bar.zoo.elephant')
    assert parsed.tag == 'body'
    assert parsed.identifier == 'content'
    assert len(parsed.classes) == 4

    # Five whitespace-separated units; the last unit lands at the bottom (index 0).
    stack = get_search_stack('body#content .foo .bar. .zoo. .elephants')
    assert len(stack) == 5
    assert 'elephants' in stack[0].classes
if __name__ == "__main__":
    # Optional first command-line argument: path of the input file.
    # Without it the driver reads from stdin.
    input_file = None
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    driver(input_file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement