Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
---
# Scraper configuration for the sandbox catalogue at
# https://www.diggernaut.com/sandbox/ (appears to be a Diggernaut digger
# config - confirm against the consuming tool).
#
# Flow, as written below:
#   1. queue the start URL in the `paginator` pool and walk every queued page;
#   2. on each page, queue the "next" link and remember the total result count;
#   3. for each class block, remember its name and description;
#   4. for each activity row of the class, build a `post` object from the row
#      cells plus fields read from the row's details page, then save it.
config:
  debug: 2
  agent: Firefox
do:
  # seed the paginator pool with the first page
  - link_add:
      pool: paginator
      url: https://www.diggernaut.com/sandbox/
  # visit every link queued in the paginator pool
  - walk:
      to: links
      pool: paginator
      do:
        # add next page of paginator to pool
        - find:
            path: 'div#content > ul > li.next > a'
            do:
              - parse:
                  attr: href
              - normalize:
                  routine: url
              - link_add:
                  pool: paginator
        # get total results number and keep it in a variable so that every
        # row object built further down can copy it
        - find:
            path: h4:contains('total results')
            do:
              - parse:
                  filter:
                    - results:\s*(\d+)
              - variable_clear: number_of_results
              - variable_set: number_of_results
        - find:
            path: 'div#search-detail > div.result-content'
            do:
              # --[ CLASS ]----------------------------------------------------
              # take name of Class
              - find:
                  path: 'h3'
                  do:
                    - parse
                    - variable_clear: class_name
                    - variable_set: class_name
              # take description of Class
              - find:
                  path: 'div > p'
                  do:
                    - parse
                    - variable_clear: class_description
                    - variable_set: class_description
              # take Class Activities: one `post` object per table row
              - find:
                  path: 'table > tbody > tr'
                  do:
                    # prepare new object
                    - object_new: post
                    # save main information (values captured above)
                    - variable_get: number_of_results
                    - object_field_set:
                        object: post
                        field: number_of_results
                    - variable_get: class_name
                    - object_field_set:
                        object: post
                        field: class_name
                    - variable_get: class_description
                    - object_field_set:
                        object: post
                        field: class_description
                    # activity field
                    - find:
                        path: 'td.col2'
                        do:
                          - parse
                          - object_field_set:
                              object: post
                              field: activity
                    # date field: the cell holds "<from> - <to>"
                    # NOTE(review): the filters accept 2-4 digit years while
                    # format_in uses '%y' - confirm 4-digit years never occur
                    - find:
                        path: 'td.col5'
                        do:
                          # date from (the date before the dash)
                          - parse:
                              filter:
                                - (\d{1,2}\/\d{1,2}\/\d{2,4})\s*-
                          - normalize:
                              routine: date_format
                              args:
                                format_in: '%m/%d/%y'
                                format_out: '%Y-%m-%d'
                          - object_field_set:
                              object: post
                              field: date_from
                          # date to (the date after the dash)
                          - parse:
                              filter:
                                - \s*-\s*(\d{1,2}\/\d{1,2}\/\d{2,4})
                          - normalize:
                              routine: date_format
                              args:
                                format_in: '%m/%d/%y'
                                format_out: '%Y-%m-%d'
                          - object_field_set:
                              object: post
                              field: date_to
                    # days field
                    - find:
                        path: 'td.col7'
                        do:
                          - parse
                          - object_field_set:
                              object: post
                              field: days
                    # fees field
                    - find:
                        path: 'td.col8'
                        do:
                          - parse
                          - object_field_set:
                              object: post
                              field: fees
                    # find a link to details page and walk into it
                    - find:
                        path: 'td.col11 > a'
                        do:
                          - parse:
                              attr: href
                          - normalize:
                              routine: url
                          - walk:
                              to: value
                              do:
                                # --[ CLASS DETAILS ]--------------------------
                                # find 1-st field with gender
                                - find:
                                    path: tr:contains('Gender')
                                    do:
                                      - parse:
                                          filter:
                                            - Gender:\s*(.*)
                                      - object_field_set:
                                          object: post
                                          field: gender
                                # find 2-nd field with age
                                - find:
                                    path: tr:contains('Ages')
                                    do:
                                      - parse:
                                          filter:
                                            - Ages:\s*(.*)
                                      - object_field_set:
                                          object: post
                                          field: ages
                                # find 3-d field with address and phone
                                - find:
                                    path: tr:contains('Dates\\Days\\Times')
                                    do:
                                      - find:
                                          path: 'td:nth-child(2) > table > tbody > tr > td:nth-child(4) > div'
                                          # taking all divs except first one
                                          slice: 1:-1
                                          # merging divs into one block
                                          merge: div
                                          do:
                                            # parsing for phone number by regex
                                            - parse:
                                                filter:
                                                  - (\(\d+\)\s*[\d-]+)
                                            - object_field_set:
                                                object: post
                                                field: phone
                                            # removing last div with phone number
                                            - node_remove: div:nth-child(3)
                                            # concatenate two parts of address
                                            # with comma+space delimiter
                                            - find:
                                                path: div
                                                do:
                                                  - parse
                                                  - object_field_set:
                                                      object: post
                                                      field: address
                                                      joinby: ', '
                                # --[ CLASS DETAILS ]--------------------------
                    # save the row object once all its fields are collected
                    # NOTE(review): nesting level reconstructed from a paste
                    # that lost its indentation - confirm object_save belongs
                    # at row level (sibling of object_new)
                    - object_save:
                        name: post
              # --[ CLASS ]----------------------------------------------------
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement