• Jump To … +
    dom-util.coffee predicate-factory.coffee stew.coffee
  • stew.coffee

  • ¶
    fs               = require 'fs'
    path             = require 'path'
    HOMEDIR          = path.join(__dirname,'..')
    LIB_DIR          = if fs.existsSync(path.join(HOMEDIR,'lib-cov')) then path.join(HOMEDIR,'lib-cov') else path.join(HOMEDIR,'lib')
    DOMUtil          = require(path.join(LIB_DIR,'dom-util')).DOMUtil
    PredicateFactory = require(path.join(LIB_DIR,'predicate-factory')).PredicateFactory
  • ¶

    Stew is a DOM selection engine that supports the full CSS selector syntax as well as CSS selectors extended with regular expressions.

    Method names that start with _ are subject to change without notice. Other methods may be considered a part of the public API.

    class Stew
  • ¶

    The Stew constructor accepts an optional DOMUtil instance (allowing callers to configure the DOMUtil used by Stew).

      # Builds a Stew around a fresh PredicateFactory and the supplied DOMUtil.
      #
      # dom_util - optional DOMUtil instance; a new DOMUtil is created when the
      #            caller passes nothing (or null).
      constructor:(dom_util)->
        @factory = new PredicateFactory()
        @dom_util = if dom_util? then dom_util else new DOMUtil()
  • ¶

    select selects nodes from the given dom that match the given selector.

    If selector is a string, it will be parsed as described in the README. Otherwise selector is assumed to be a predicate function (like those generated by PredicateFactory).

    If dom is a string, it will be parsed as HTML (using DOMUtil.parse_html, which see). If dom is a single node, the given selector will be applied to it. If dom is an array of nodes, the given selector will be applied to each element in turn.

    This results in an array of matching nodes.

    If a callback is provided, the resulting array is passed to it (assuming the signature callback(err,nodeset)). Otherwise the resulting array is returned by this function.

    Note that when dom is a string, a callback method must be provided. (Since our HTML parsing is asynchronous.) When dom is an object, the callback method is optional (but will be used when present)

      # Selects every node in `dom` that matches `selector`.
      #
      # dom      - an HTML string, a single node or an array of nodes.
      # selector - a selector string (parsed via `_parse_selectors`) or a predicate function.
      # callback - optional `callback(err,nodeset)`; required when `dom` is a string.
      #
      # Returns the array of matching nodes when `dom` is not a string; for string
      # input the nodes are only delivered through `callback`.
      # Throws an Error when `dom` is a string and no callback was supplied.
      select:(dom,selector,callback)->
        selector = @_parse_selectors(selector) if typeof selector is 'string'

        # Already-parsed input is handled synchronously; the callback is a courtesy.
        unless typeof dom is 'string'
          nodeset = @_unguarded_select(dom,selector)
          callback?(null,nodeset)
          return nodeset

        # String input has to go through the asynchronous HTML parser.
        unless callback?
          throw new Error('When select is invoked on a string object, the `callback(err,nodeset)` parameter is required.')
        @dom_util.parse_html dom, (err, parsed_dom)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select(parsed_dom,selector))
  • ¶

    _unguarded_select is the "inner" method for select. It assumes dom is a node or array of nodes and that predicate is a predicate function. It returns an array of matching nodes. (Generally this method will not be directly called by clients.)

      # Walks `dom` with `@dom_util.walk_dom` and collects every visited node for
      # which `predicate` returns a truthy value.
      #
      # dom       - a node or array of nodes.
      # predicate - function invoked as `predicate(node,parent,path,siblings,sib_index)`.
      #
      # Returns the array of matching nodes, in visiting order.
      _unguarded_select:(dom,predicate)->
        matches = []
        collect = (node,parent,path,siblings,sib_index)->
          matches.push(node) if predicate(node,parent,path,siblings,sib_index)
          # Never prune: matches may be nested inside other matches.
          return { 'continue':true, 'visit_children':true }
        @dom_util.walk_dom dom, { visit:collect }
        return matches
  • ¶

    select_first selects the first node in the given dom that matches the given selector.

    It behaves exactly like select (which see) save that it aborts processing as soon as the first matching node is found, and returns a single node rather than an array of nodes.

      # Selects the first node in `dom` that matches `selector`.
      #
      # dom      - an HTML string, a single node or an array of nodes.
      # selector - a selector string (parsed via `_parse_selectors`) or a predicate function.
      # callback - optional `callback(err,node)`; required when `dom` is a string.
      #
      # Returns the first matching node (or null) when `dom` is not a string; for
      # string input the node is only delivered through `callback`.
      # Throws an Error when `dom` is a string and no callback was supplied.
      select_first:(dom,selector,callback)->
        selector = @_parse_selectors(selector) if typeof selector is 'string'

        # Already-parsed input is handled synchronously; the callback is a courtesy.
        unless typeof dom is 'string'
          node = @_unguarded_select_first(dom,selector)
          callback?(null,node)
          return node

        # String input has to go through the asynchronous HTML parser.
        unless callback?
          throw new Error('When select_first is invoked on a string object, the `callback(err,node)` parameter is required.')
        @dom_util.parse_html dom, (err, parsed_dom)=>
          if err?
            callback(err)
          else
            callback(null,@_unguarded_select_first(parsed_dom,selector))
  • ¶

    _unguarded_select_first is the "inner" method for select_first. (Generally this method will not be directly called by clients.)

      # Walks `dom` with `@dom_util.walk_dom` and stops at the first visited node
      # for which `predicate` returns a truthy value.
      #
      # dom       - a node or array of nodes.
      # predicate - function invoked as `predicate(node,parent,path,siblings,sib_index)`.
      #
      # Returns the first matching node, or null when nothing matched.
      _unguarded_select_first:(dom,predicate)->
        first_match = null
        find_first = (node,parent,path,siblings,sib_index)->
          unless predicate(node,parent,path,siblings,sib_index)
            return { 'continue':true, 'visit_children':true }
          # Found it: remember the node and ask the walker to stop.
          first_match = node
          return { 'continue':false, 'visit_children':false }
        @dom_util.walk_dom dom, { visit:find_first }
        return first_match
  • ¶

    _SPLIT_ON_WS_REGEXP is a regular expression that is used to split a string of CSS selectors into individual selectors. It is similar to str.split(/\s/), but: - treats "quoted phrases" (and /regular expressions/) as a single token - also splits on the CSS "operators" `>`, `+`, `,` and `~` (Shout-out to http://stackoverflow.com/questions/2817646/javascript-split-string-on-space-or-on-quotes-to-array from which this expression was originally derived.)

      # Tokenizer for selector strings. The expression is global, so repeated
      # `exec` calls walk through the input one token at a time.
      # A token is either
      #   - a run made of ordinary characters, "quoted phrases", /regex literals/
      #     and [bracketed clauses], or
      #   - a single `,`, `+`, `~` or `>` operator.
      # NOTE(review): `~` is not excluded from the "ordinary character" class
      # (presumably so that `[attr~=value]` stays intact), which means `a~b`
      # written without whitespace is returned as one token rather than three.
      _SPLIT_ON_WS_REGEXP = /([^\"\/\s,\+>]|(\"[^\"]+\")|(\/[^\/]+\/)|(\[[^\]]*\]))+|[,\+~>]/g
  • ¶

    _split_on_ws_respecting_quotes is used to split a string of CSS selectors into individual selectors.

      # Breaks a selector string into its tokens using `_SPLIT_ON_WS_REGEXP`.
      #
      # selector - the selector string to tokenize.
      #
      # Returns an array with the text of every token, in order of appearance.
      _split_on_ws_respecting_quotes:(selector)->
        tokens = []
        loop
          found = _SPLIT_ON_WS_REGEXP.exec(selector)
          # `exec` on the global expression yields null once the input is exhausted.
          break unless found?[0]?
          tokens.push(found[0])
        return tokens
  • ¶

    _parse_selectors accepts a string containing one or more CSS selectors and returns the corresponding predicate (a boolean-valued function that is invoked as predicate(node,parent,path,siblings,sib_index))

      # Turns a selector expression into a single predicate.
      #
      # selectors - a selector string (tokenized with `_split_on_ws_respecting_quotes`)
      #             or an array of already-split tokens.
      #
      # Returns the `descendant_predicate` built from the parsed tokens, or an
      # empty array when no selector token was found.
      _parse_selectors:(selectors)->
        selectors = @_split_on_ws_respecting_quotes(selectors) if typeof selectors is 'string'
        predicates = []
        # One flag per operator; a flag stays raised until the next selector token consumes it.
        # TODO there is probably a more elegant way to handle `>`, `+` and `,` here.
        pending_child = false
        pending_adjacent = false
        pending_sibling = false
        pending_or = false
        for token in selectors
          switch token
            when '>' then pending_child = true
            when '+' then pending_adjacent = true
            when '~' then pending_sibling = true
            when ',' then pending_or = true
            else
              predicate = @_parse_selector(token)
              # An operator binds the previous predicate (popped off the list) to this one.
              if pending_child
                predicate = @factory.direct_descendant_predicate( predicates.pop(), predicate )
                pending_child = false
              else if pending_adjacent
                predicate = @factory.adjacent_sibling_predicate( predicates.pop(), predicate )
                pending_adjacent = false
              else if pending_sibling
                predicate = @factory.preceding_sibling_predicate( predicates.pop(), predicate )
                pending_sibling = false
              else if pending_or
                predicate = @factory.or_predicate( [ predicates.pop(), predicate ] )
                pending_or = false
              predicates.push( predicate )
        return predicates unless predicates.length > 0
        return @factory.descendant_predicate(predicates)
  • ¶

    _CSS_SELECTOR_REGEXP is a regular expression for parsing an individual CSS selector (which might include a tag name, an ID, one or more classes, one or more attributes and a pseudo class).

    "tag#id.class-one.class-two[name~=\"value with spaces\"]".match(_CSS_SELECTOR_REGEXP)

      # Matches one compound selector in the fixed order name, `#id`, `.class`es,
      # `[attribute]` clauses, `:pseudo-class`. Every part is optional, and most parts
      # accept either a plain word or a /regex/ literal. The digits in the two ruler
      # lines above the expression give the number of the capture group opening at
      # that column (tens digit on the upper line, units digit on the lower line).
      #{ TODO: Combine the `id` and `class` rules to make them order-independent? (I think CSS specifies the order, but still.)
      #{############################################################################################################################################################################################################################################################################
      #{                                                                                            11                  1           11  11                  1        112   2    2     2     2     2     22 22       3          33                  3                 3 3           #
      #{                     12                  3            4  56                  7          89  01                  2           34  56                  7        890   1    2     3     4     5     67 89       0          12                  3                 4 5           #
      _CSS_SELECTOR_REGEXP: /((\/[^\/]*\/[gmi]*)|(\*|[\w-]+))?(\#((\/[^\/]*\/[gmi]*)|([\w-]+)))?((\.((\/[^\/]*\/[gmi]*)|([\w-]+)))*)((\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- ]+))))?\])*)(:([\w-]+))?/ #
      #{                      \-name--------------------------/|\-id-----------------------------/\-class(es)-----------------------/||  \-attr-name-----------------/|\-operator----------------------/\-value-----------------------------------------------/|  | |\-pseudo--/   #
      #{                                                                                                                             ||                               \-operator-and-value---------------------------------------------------------------------/  | |              #
      #{                                                                                                                             |\-attr-clause-([])----------------------------------------------------------------------------------------------------------/ |              #
      #{                                                                                                                             \-attr-clauses-([][]...)-------------------------------------------------------------------------------------------------------/              #
      #{############################################################################################################################################################################################################################################################################
  • ¶

    Indices of the important captured groups.

      # Capture-group numbers within _CSS_SELECTOR_REGEXP.
      _NAME         = 1   # tag name: `*`, a plain word or a /regex/
      _ID           = 4   # the id part, including its leading `#`
      _CLASSES      = 8   # all `.class` parts as one string (e.g. `.foo.bar`)
      _ATTRIBUTES   = 13  # all `[...]` clauses as one string
      _PSEUDO_CLASS = 35  # the pseudo-class name, without its leading `:`
  • ¶

    _ATTRIBUTE_CLAUSE_REGEXP is a regular expression used to split one or more [<name> <op> <value>] expressions into individual components.

      # Matches a single `[name]` or `[name<op>value]` clause. The expression is
      # global so that successive `exec` calls step through a string holding several
      # clauses. The value may be "double quoted", a /regex/ literal or a bare word.
      # The digits in the two ruler lines above the expression give the number of the
      # capture group opening at that column.
      #{###########################################################################################################################################################
      #{                                                                          1     1     1     11 11       1          11                  2                  #
      #{                         1  23                  4        567   8    9     0     1     2     34 56       7          89                  0                  #
      _ATTRIBUTE_CLAUSE_REGEXP: /(\[((\/[^\/]*\/[gmi]*)|([\w-]+))(((=)|(~=)|(\|=)|(\*=)|(\^=)|(\$=))(("(([^\\"]|(\\"))*)")|((\/[^\/]*\/[gmi]*)|([\w- ]+))))?\])/g #
      #{                            \-name----------------------/|\-operator-----------------------/\-value-----------------------------------------------/|      #
      #{                                                         \-operator-and-value----------------------------------------------------------------------/      #
      #{###########################################################################################################################################################
  • ¶

    Indices of the important captured groups.

      # Capture-group numbers within _ATTRIBUTE_CLAUSE_REGEXP.
      _ATTR_NAME              = 2   # attribute name: a plain word or a /regex/
      _OPERATOR               = 6   # one of `=`, `~=`, `|=`, `*=`, `^=`, `$=`; unset for `[name]`
      _DEQUOTED_ATTR_VALUE    = 15  # text between the double quotes of a quoted value
      _NEVERQUOTED_ATTR_VALUE = 18  # an unquoted value: a /regex/ literal or a bare word
  • ¶

    _parse_selector returns a (possibly compound) predicate that matches the provided selector (string).

      # Parses one compound selector (for example `tag#id.class[attr=value]:first-child`)
      # into a predicate.
      #
      # selector - a single selector token, as produced by `_split_on_ws_respecting_quotes`.
      #
      # Returns the `and_predicate` of every clause found in `selector`, or an
      # empty array when no clause was recognized.
      _parse_selector:(selector)->
        # Every part of the expression is optional, so a string always yields a
        # (possibly empty) match here.
        match = @_CSS_SELECTOR_REGEXP.exec(selector)
        clauses = []

        # Re-creates `regexp` with `prefix`/`suffix` wrapped around its source while
        # keeping its i/g/m flags (a bare `new RegExp(source)` would drop them, turning
        # e.g. `[href^=/foo/i]` into a case-sensitive match).
        with_anchor = (regexp,prefix,suffix)->
          flags = ''
          flags += 'i' if regexp.ignoreCase
          flags += 'g' if regexp.global
          flags += 'm' if regexp.multiline
          return new RegExp("#{prefix}#{regexp.source}#{suffix}",flags)

        # The name part.
        if match[_NAME]?
          if match[_NAME] is '*'
            clauses.push(@factory.any_tag_predicate())
          else
            clauses.push(@factory.by_tag_predicate(@_to_string_or_regex(match[_NAME])))

        # The ID part. The captured text includes the leading `#`, hence `substring(1)`.
        if match[_ID]?
          clauses.push(@factory.by_id_predicate(@_to_string_or_regex(match[_ID].substring(1))))

        # One or more class parts. match[_CLASSES] contains something like `.foo.bar`.
        # Walk it token by token instead of splitting on `.`, because a /regex/ class
        # name may itself contain dots (e.g. `./foo.*/`).
        if match[_CLASSES]?.length > 0
          class_token = /\.((\/[^\/]*\/[gmi]*)|([\w-]+))/g
          while (class_match = class_token.exec(match[_CLASSES]))?
            clauses.push(@factory.by_class_predicate(@_to_string_or_regex(class_match[1])))

        # One or more attribute parts. match[_ATTRIBUTES] contains one or more
        # `[name=value]` (or `[name]`) strings.
        # TODO: the `*=`, `^=` and `$=` handling could still be more DRY.
        if match[_ATTRIBUTES]?.length > 0
          # The clause expression is global and lives on the prototype; rewind it in
          # case an earlier parse threw part-way through and left `lastIndex` behind.
          @_ATTRIBUTE_CLAUSE_REGEXP.lastIndex = 0
          attr_match = @_ATTRIBUTE_CLAUSE_REGEXP.exec(match[_ATTRIBUTES])
          while attr_match?
            operator = attr_match[_OPERATOR]
            raw_value = attr_match[_DEQUOTED_ATTR_VALUE] ? attr_match[_NEVERQUOTED_ATTR_VALUE]
            if attr_match[_ATTR_NAME]? and not operator?
              # `[name]`: the attribute merely has to exist.
              clauses.push(@factory.by_attr_exists_predicate(@_to_string_or_regex(attr_match[_ATTR_NAME])))
            else if attr_match[_ATTR_NAME]? and raw_value?
              attr_name = @_to_string_or_regex(attr_match[_ATTR_NAME])
              attr_value = @_to_string_or_regex(raw_value)
              switch operator
                when '|='
                  clauses.push(@factory.by_attr_value_pipe_equals(attr_name,attr_value))
                when '^=' # starts with
                  if typeof attr_value is 'string'
                    attr_value = new RegExp("^#{@factory._escape_for_regexp(attr_value)}")
                  else if not /^\^/.test(attr_value.source)
                    attr_value = with_anchor(attr_value,'^','')
                  clauses.push(@factory.by_attr_value_predicate(attr_name,attr_value))
                when '$=' # ends with
                  if typeof attr_value is 'string'
                    attr_value = new RegExp("#{@factory._escape_for_regexp(attr_value)}$")
                  else if not /\$$/.test(attr_value.source)
                    attr_value = with_anchor(attr_value,'','$')
                  clauses.push(@factory.by_attr_value_predicate(attr_name,attr_value))
                when '*=' # contains
                  if typeof attr_value is 'string'
                    attr_value = new RegExp(@factory._escape_for_regexp(attr_value))
                  clauses.push(@factory.by_attr_value_predicate(attr_name,attr_value))
                else
                  # `=` compares the whole value; `~=` compares each whitespace-delimited word.
                  delim = if operator is '~=' then /\s+/ else null
                  clauses.push(@factory.by_attr_value_predicate(attr_name,attr_value,delim))
            attr_match = @_ATTRIBUTE_CLAUSE_REGEXP.exec(match[_ATTRIBUTES])

        # The pseudo-class part. Only `first-child` is recognized; others are ignored.
        if match[_PSEUDO_CLASS]?
          if match[_PSEUDO_CLASS] is 'first-child'
            clauses.push(@factory.first_child_predicate())

        # Combine them with `and` if needed.
        if clauses.length > 0
          clauses = @factory.and_predicate(clauses)

        return clauses
  • ¶

    _to_string_or_regex converts a string that starts and ends with / (with an optional g, m or i suffix) into a regular expression, and otherwise returns the original str value.

      # Converts `/source/flags` text into a RegExp.
      #
      # str - the string to inspect.
      #
      # Returns a RegExp when `str` starts with `/` and ends with `/` plus optional
      # `g`, `m` or `i` flags; otherwise returns `str` itself.
      _to_string_or_regex:(str)->
        parsed = str.match /^\/(.*)\/([gmi]*)$/
        return str unless parsed?[1]?
        [source, flags] = parsed[1..2]
        return new RegExp(source,flags)
  • ¶

    Public API includes Stew and DOMUtil

    # Attach the public API to `exports` when that name holds a value, falling
    # back to `this` otherwise.
    exports = exports ? this
    exports.Stew = Stew
    exports.DOMUtil = DOMUtil