stew.coffee

Jump To … +

dom-util.coffee predicate-factory.coffee stew.coffee

stew.coffee

fs               = require 'fs'
path             = require 'path'
HOMEDIR          = path.join(__dirname,'..')
LIB_DIR          = if fs.existsSync(path.join(HOMEDIR,'lib-cov')) then path.join(HOMEDIR,'lib-cov') else path.join(HOMEDIR,'lib')
DOMUtil          = require(path.join(LIB_DIR,'dom-util')).DOMUtil
PredicateFactory = require(path.join(LIB_DIR,'predicate-factory')).PredicateFactory

¶

Stew is a DOM selection engine that supports the full CSS selector syntax as well as CSS selectors extended with regular expressions.

Method names that start with _ are subject to change without notice. Other methods may be considered a part of the public API.
```
class Stew
```

The Stew constructor accepts an optional DOMUtil instance (allowing callers to configure the DOMUtil used by Stew).

  constructor:(dom_util)->
    # Each Stew instance owns a PredicateFactory, used to build selector predicates.
    @factory = new PredicateFactory
    # Honour a caller-supplied DOMUtil; fall back to a default instance only
    # when none (null/undefined) was given.
    @dom_util = if dom_util? then dom_util else new DOMUtil

¶

select selects nodes from the given dom that match the given selector.

If selector is a string, it will be parsed as described in the README. Otherwise selector is assumed to be a predicate function (like those generated by PredicateFactory).

If dom is a string, it will be parsed as HTML (using DOMUtil.parse_html, which see). If dom is a single node, the given selector will be applied to it. If dom is an array of nodes, the given selector will be applied to each element in turn.

This results in an array of matching nodes.

If a callback is provided, the resulting array is passed to it (assuming the signature callback(err,nodeset)). Otherwise the resulting array is returned by this function.

Note that when dom is a string, a callback method must be provided (since our HTML parsing is asynchronous). When dom is an object, the callback method is optional (but will be used when present).
```
  # Selects every node in `dom` that matches `selector`.
  #
  # dom      - an HTML string, a single node, or an array of nodes.
  # selector - a selector string (parsed via `_parse_selectors`) or a predicate function.
  # callback - optional `callback(err,nodeset)`; required when `dom` is a string.
  #
  # For node input the array of matches is returned (and also passed to
  # `callback` when it is a function). For string input the matches are only
  # delivered through `callback`; the return value is whatever `parse_html` returns.
  # Throws an Error when `dom` is a string and no callback was given.
  select:(dom,selector,callback)->
    predicate = if typeof selector is 'string' then @_parse_selectors(selector) else selector
    # Already-parsed input: select immediately.
    unless typeof dom is 'string'
      matches = @_unguarded_select(dom,predicate)
      callback?(null,matches)
      return matches
    # String input must be parsed first, and the result can only arrive via the callback.
    unless callback?
      throw new Error('When select is invoked on a string object, the `callback(err,nodeset)` parameter is required.')
    return @dom_util.parse_html dom, (err,parsed)=>
      return callback(err) if err?
      return callback(null,@_unguarded_select(parsed,predicate))
```

_unguarded_select is the "inner" method for select. It assumes dom is a node or array of nodes and that predicate is a predicate function. It returns an array of matching nodes. (Generally this method will not be directly called by clients.)

  # Walks `dom` with `@dom_util.walk_dom` and returns an array of every visited
  # node for which `predicate(node,parent,path,siblings,sib_index)` is truthy.
  _unguarded_select:(dom,predicate)->
    matches = []
    collect = (node,parent,path,siblings,sib_index)->
      matches.push(node) if predicate(node,parent,path,siblings,sib_index)
      # Never stop early: keep walking and keep descending into children.
      return { 'continue':true, 'visit_children':true }
    @dom_util.walk_dom(dom, { visit:collect })
    return matches

select_first selects the first node in the given dom that matches the given selector.

It behaves exactly like select (which see) save that it aborts processing as soon as the first matching node is found, and returns a single node rather than an array of nodes.

  # Selects the first node in `dom` that matches `selector`.
  #
  # dom      - an HTML string, a single node, or an array of nodes.
  # selector - a selector string (parsed via `_parse_selectors`) or a predicate function.
  # callback - optional `callback(err,node)`; required when `dom` is a string.
  #
  # For node input the matching node (or null) is returned (and also passed to
  # `callback` when it is a function). For string input the node is only
  # delivered through `callback`; the return value is whatever `parse_html` returns.
  # Throws an Error when `dom` is a string and no callback was given.
  select_first:(dom,selector,callback)->
    predicate = if typeof selector is 'string' then @_parse_selectors(selector) else selector
    # Already-parsed input: select immediately.
    unless typeof dom is 'string'
      first_match = @_unguarded_select_first(dom,predicate)
      callback?(null,first_match)
      return first_match
    # String input must be parsed first, and the result can only arrive via the callback.
    unless callback?
      throw new Error('When select_first is invoked on a string object, the `callback(err,node)` parameter is required.')
    return @dom_util.parse_html dom, (err,parsed)=>
      return callback(err) if err?
      return callback(null,@_unguarded_select_first(parsed,predicate))

_unguarded_select_first is the "inner" method for select_first. (Generally this method will not be directly called by clients.)

  # Walks `dom` with `@dom_util.walk_dom` and returns the first visited node for
  # which `predicate(node,parent,path,siblings,sib_index)` is truthy, or null
  # when no node matches.
  _unguarded_select_first:(dom,predicate)->
    found = null
    visitor = (node,parent,path,siblings,sib_index)->
      # No match yet: keep walking and keep descending.
      unless predicate(node,parent,path,siblings,sib_index)
        return { 'continue':true, 'visit_children':true }
      # First match: remember it and tell the walker to stop.
      found = node
      return { 'continue':false, 'visit_children':false }
    @dom_util.walk_dom(dom, { visit:visitor })
    return found

¶

_SPLIT_ON_WS_REGEXP is a regular expression that is used to split a string of CSS selectors into individual selectors. It is similar to str.split(/\s/), but it (1) treats "quoted phrases" (and /regular expressions/) as a single token, and (2) also splits on the CSS "operators" `>`, `+`, `,` and `~`. (Shout-out to http://stackoverflow.com/questions/2817646/javascript-split-string-on-space-or-on-quotes-to-array from which this expression was originally derived.)
```
  # Tokenizer for a selector string. Assigned with `=` (not `:`), so it is a
  # variable local to the class body and is referenced bare, without `@`.
  #   - first alternative: one or more of an ordinary character, a "quoted phrase",
  #     a /regex/ literal or a [bracketed clause], taken together as ONE token;
  #   - second alternative: a lone `,`, `+`, `~` or `>` operator.
  # The `g` flag lets repeated `exec` calls step through successive tokens.
  # NOTE(review): `~` is not excluded by the first character class (unlike `,`,
  # `+` and `>`), so it appears to become its own token only when it is not
  # adjacent to ordinary characters (e.g. `a ~ b`, not `a~b`) -- confirm intent.
  _SPLIT_ON_WS_REGEXP = /([^\"\/\s,\+>]|(\"[^\"]+\")|(\/[^\/]+\/)|(\[[^\]]*\]))+|[,\+~>]/g
```

_split_on_ws_respecting_quotes is used to split a string of CSS selectors into individual selectors.

  # Splits a selector string into its tokens, as defined by `_SPLIT_ON_WS_REGEXP`.
  # Returns an array of token strings (empty when nothing matches).
  _split_on_ws_respecting_quotes:(selector)->
    # With a global regexp, `match` returns every full match in order -- the
    # same strings an exec loop would collect -- or null when there are none.
    tokens = String(selector).match(_SPLIT_ON_WS_REGEXP)
    return tokens ? []

_parse_selectors accepts a string containing one or more CSS selectors and returns the corresponding predicate (a boolean-valued function that is invoked as predicate(node,parent,path,siblings,sib_index), as in _unguarded_select).

  # _parse_selectors turns a selector string (or an already-tokenized array of
  # selector tokens) into a single predicate.
  #
  # selectors - a string such as `div > p.note, ul li`, or the array of tokens
  #             that `_split_on_ws_respecting_quotes` would produce for it.
  #
  # Returns a predicate built with `@factory`. When no selector token is present
  # an empty array is returned instead.
  #
  # Grammar handled here:
  #   - whitespace between tokens  => descendant chain (`descendant_predicate`)
  #   - `>`, `+`, `~`              => bind the previous predicate of the current
  #                                   chain to the next one
  #   - `,`                        => starts a NEW chain; the final predicate
  #                                   matches when ANY chain matches
  _parse_selectors:(selectors)->
    if typeof selectors is 'string'
      selectors = @_split_on_ws_respecting_quotes(selectors)
    # One entry per comma-separated chain. Grouping on `,` is what makes
    # `a b, c d` mean `(a b) or (c d)` rather than `a (b or c) d`.
    groups = [[]]
    # Name of the factory method for a combinator still waiting for its right-hand side.
    combinator = null
    for selector in selectors
      switch selector
        when ','
          groups.push([])
          combinator = null # a combinator left dangling before `,` has nothing to bind to
        when '>' then combinator = 'direct_descendant_predicate'
        when '+' then combinator = 'adjacent_sibling_predicate'
        when '~' then combinator = 'preceding_sibling_predicate'
        else
          predicate = @_parse_selector(selector)
          group = groups[groups.length-1]
          if combinator?
            # Replace the chain's last predicate with the combined (left, right) predicate.
            predicate = @factory[combinator](group.pop(), predicate)
            combinator = null
          group.push(predicate)
    # Empty chains (e.g. from a trailing or doubled comma) are ignored.
    alternatives = (@factory.descendant_predicate(group) for group in groups when group.length > 0)
    return [] if alternatives.length is 0
    return alternatives[0] if alternatives.length is 1
    return @factory.or_predicate(alternatives)

_CSS_SELECTOR_REGEXP is a regular expression for parsing an individual CSS selector (which might include a tag name, an ID, one or more classes, one or more attributes and a pseudo class).

"tag#id.class-one.class-two[name~=\"value with spaces\"]".match(_CSS_SELECTOR_REGEXP)

  # Parses ONE compound selector. Every top-level part is optional, in this fixed
  # order: name, `#id`, `.class` parts, `[attr]` clauses, `:pseudo-class`.
  # A name, id, class, attribute name or attribute value may also be written as a
  # `/regex/` literal with optional `g`, `m`, `i` flags.
  # The digit rows below number the capture groups (read each column top to
  # bottom, e.g. `1` over `0` is group 10); the rows beneath the expression label
  # the span of each part.
  #{ TODO: Combine the `id` and `class` rules to make them order-independent? (I think CSS specifies the order, but still.)
  #{############################################################################################################################################################################################################################################################################
  #{                                                                                            11                  1           11  11                  1        112   2    2     2     2     2     22 22       3          33                  3                 3 3           #
  #{                     12                  3            4  56                  7          89  01                  2           34  56                  7        890   1    2     3     4     5     67 89       0          12                  3                 4 5           #
  _CSS_SELECTOR_REGEXP: /((\/[^\/]*\/[gmi]*)|(\*|[\w-]+))?(\#((\/[^\/]*\/[gmi]*)|([\w-]+)))?((\.((\/[^\/]*\/[gmi]*)|([\w-]+)))*)((\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- ]+))))?\])*)(:([\w-]+))?/ #
  #{                      \-name--------------------------/|\-id-----------------------------/\-class(es)-----------------------/||  \-attr-name-----------------/|\-operator----------------------/\-value-----------------------------------------------/|  | |\-pseudo--/   #
  #{                                                                                                                             ||                               \-operator-and-value---------------------------------------------------------------------/  | |              #
  #{                                                                                                                             |\-attr-clause-([])----------------------------------------------------------------------------------------------------------/ |              #
  #{                                                                                                                             \-attr-clauses-([][]...)-------------------------------------------------------------------------------------------------------/              #
  #{############################################################################################################################################################################################################################################################################

Indices of the important captured groups.

  # Capture-group positions within `_CSS_SELECTOR_REGEXP`. They must be kept in
  # step with that expression: adding or removing a group there shifts them.
  _NAME         = 1   # tag name, `*`, or a /regex/ literal
  _ID           = 4   # the id part INCLUDING its leading `#`
  _CLASSES      = 8   # all class parts as one string, e.g. `.foo.bar`
  _ATTRIBUTES   = 13  # all `[...]` clauses as one string, e.g. `[a][b=c]`
  _PSEUDO_CLASS = 35  # pseudo-class name without the leading `:`

_ATTRIBUTE_CLAUSE_REGEXP is a regular expression used to split one or more [<name> <op> <value>] expressions into individual components.

  # Matches ONE `[name]` or `[name<op>value]` clause. The `g` flag lets repeated
  # `exec` calls walk through a string holding several clauses back to back.
  # The digit rows below number the capture groups (read each column top to
  # bottom); the rows beneath the expression label the span of each part.
  #{###########################################################################################################################################################
  #{                                                                          1     1     1     11 11       1          11                  2                  #
  #{                         1  23                  4        567   8    9     0     1     2     34 56       7          89                  0                  #
  _ATTRIBUTE_CLAUSE_REGEXP: /(\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- ]+))))?\])/g #
  #{                            \-name----------------------/|\-operator-----------------------/\-value-----------------------------------------------/|      #
  #{                                                         \-operator-and-value----------------------------------------------------------------------/      #
  #{###########################################################################################################################################################

Indices of the important captured groups.

  # Capture-group positions within `_ATTRIBUTE_CLAUSE_REGEXP`. They must be kept
  # in step with that expression.
  _ATTR_NAME              = 2   # attribute name (plain word or /regex/ literal)
  _OPERATOR               = 6   # one of `=`, `~=`, `|=`, `*=`, `^=`, `$=`; unset for a bare `[name]`
  _DEQUOTED_ATTR_VALUE    = 15  # text between the double quotes of a "quoted" value
  _NEVERQUOTED_ATTR_VALUE = 18  # an unquoted value (plain text or /regex/ literal)

_parse_selector returns a (possibly compound) predicate that matches the provided selector (string).

  # _parse_selector converts ONE compound selector token (for example
  # `tag#id.class-one[name="value"]:first-child`) into a predicate.
  #
  # selector - a single selector token (string).
  #
  # Returns the `and_predicate` of every clause found (name, id, classes,
  # attributes, pseudo-class). When no clause is recognised, the empty `clauses`
  # array itself is returned.
  _parse_selector:(selector)->
    match = @_CSS_SELECTOR_REGEXP.exec(selector)
    clauses = []

    # The name part: `*` matches any tag; anything else is a tag name or /regex/.
    if match[_NAME]?
      if match[_NAME] is '*'
        clauses.push(@factory.any_tag_predicate())
      else
        clauses.push(@factory.by_tag_predicate(@_to_string_or_regex(match[_NAME])))

    # The ID part. The captured text still carries its leading `#`, hence `substring(1)`.
    if match[_ID]?
      clauses.push(@factory.by_id_predicate(@_to_string_or_regex(match[_ID].substring(1))))

    # One or more class parts.
    if match[_CLASSES]?.length > 0    # match[_CLASSES] contains something like `.foo.bar`
      cs = match[_CLASSES].split('.') # split the string into individual class names
      cs.shift()                      # and skip the first (empty) token that is included
      for c in cs
        clauses.push(@factory.by_class_predicate(@_to_string_or_regex(c)))

    # One or more attribute parts.
    if match[_ATTRIBUTES]?.length > 0 # match[_ATTRIBUTES] contains one or more `[name=value]` (or `[name]`) strings
      attr_regexp = @_ATTRIBUTE_CLAUSE_REGEXP
      # The expression is global and shared by every call. Reset it explicitly so
      # that a previous parse that threw part-way through (e.g. on an invalid
      # /regex/ value) cannot make this one start in the middle of the string.
      attr_regexp.lastIndex = 0
      while (attr_match = attr_regexp.exec(match[_ATTRIBUTES]))?
        attr_clause = @_parse_attribute_clause(attr_match)
        clauses.push(attr_clause) if attr_clause?

    # The pseudo-class part. Only `first-child` is recognised; any other
    # pseudo-class is ignored.
    if match[_PSEUDO_CLASS]?
      if match[_PSEUDO_CLASS] is 'first-child'
        clauses.push(@factory.first_child_predicate())

    # Combine them with `and` if needed.
    if clauses.length > 0
      clauses = @factory.and_predicate(clauses)

    return clauses

  # _parse_attribute_clause converts one `_ATTRIBUTE_CLAUSE_REGEXP` match into the
  # corresponding attribute predicate, or null when the match yields no clause.
  _parse_attribute_clause:(attr_match)->
    return null unless attr_match[_ATTR_NAME]?
    attr_name = @_to_string_or_regex(attr_match[_ATTR_NAME])
    operator = attr_match[_OPERATOR]
    # A bare `[name]` only tests for the attribute's presence.
    return @factory.by_attr_exists_predicate(attr_name) unless operator?
    raw_value = attr_match[_DEQUOTED_ATTR_VALUE] ? attr_match[_NEVERQUOTED_ATTR_VALUE]
    return null unless raw_value?
    attr_value = @_to_string_or_regex(raw_value)
    switch operator
      when '|='
        return @factory.by_attr_value_pipe_equals(attr_name, attr_value)
      when '^=' # starts with
        return @factory.by_attr_value_predicate(attr_name, @_anchor_attr_value(attr_value, true))
      when '$=' # ends with
        return @factory.by_attr_value_predicate(attr_name, @_anchor_attr_value(attr_value, false))
      when '*=' # contains
        if typeof attr_value is 'string'
          attr_value = new RegExp(@factory._escape_for_regexp(attr_value))
        return @factory.by_attr_value_predicate(attr_name, attr_value)
      when '~=' # matches one of the whitespace-delimited words of the attribute
        return @factory.by_attr_value_predicate(attr_name, attr_value, /\s+/)
      else      # plain `=`
        return @factory.by_attr_value_predicate(attr_name, attr_value, null)

  # _anchor_attr_value returns a RegExp that matches `aval` (a literal string or a
  # RegExp) only at the start (`at_start` true) or only at the end (`at_start`
  # false) of the tested text.
  _anchor_attr_value:(aval, at_start)->
    if typeof aval is 'string'
      source = @factory._escape_for_regexp(aval)
      return new RegExp(if at_start then "^#{source}" else "#{source}$")
    source = aval.source
    # An expression that already carries the anchor is used untouched.
    already_anchored = if at_start then /^\^/.test(source) else /\$$/.test(source)
    return aval if already_anchored
    # Carry the caller's `i` and `m` flags over to the rebuilt expression
    # (previously they were computed but then dropped). `g` is deliberately NOT
    # carried over: it adds nothing to a yes/no match and would make a reused
    # RegExp stateful through `lastIndex` if the predicate uses `test`/`exec`.
    flags = ''
    flags += 'i' if aval.ignoreCase
    flags += 'm' if aval.multiline
    # Group the source so the anchor applies to the whole expression rather than
    # only to the first (or last) alternative of something like `a|b`.
    grouped = "(?:#{source})"
    return new RegExp((if at_start then "^#{grouped}" else "#{grouped}$"), flags)

_to_string_or_regex converts a string that starts and ends with / (with an optional g, m or i suffix) into a regular expression, and otherwise returns the original str value.

  # Interprets `str` as a regular-expression literal when it has the form
  # `/body/flags` (flags being any run of `g`, `m`, `i`) and returns the
  # corresponding RegExp; any other string is returned unchanged.
  _to_string_or_regex:(str)->
    parts = str.match /^\/(.*)\/([gmi]*)$/
    # Not of the form /.../flags: hand the original value back untouched.
    return str unless parts?[1]?
    [pattern, flags] = parts[1..2]
    return new RegExp(pattern, flags)

Public API includes Stew and DOMUtil

# Attach the public API to the existing `exports` object when one is defined
# and non-null; otherwise attach it to `this`.
exports = exports ? this
exports.Stew = Stew
exports.DOMUtil = DOMUtil