Deadly Regular Expressions

What can we learn by using regular expressions to do what they cannot do?
  • 25

    SEP
    2014

    Regex Code Equivalency

    #!/usr/bin/env ruby -w
    
    # A side-by-side cheat sheet:  each pair below shows a regex-based expression
    # on the first line and a plain String (or Array) method that produces the
    # same result on the second.  The `# =>` comments show the value of each line.
    Name = "Gray, James"
    
    # --- Predicates:  `!!` turns the match index (or nil) into true/false ---
    
    !!(Name =~ /\AGray/)      # => true
    Name.start_with?("Gray")  # => true
    
    !!(Name =~ /James\z/)    # => true
    Name.end_with?("James")  # => true
    
    !!(Name =~ /Dana/)     # => false
    Name.include?("Dana")  # => false
    
    !!(Name =~ /\A\z/)  # => false
    Name.empty?         # => false
    
    !!(Name =~ /\AGray, James\z/)  # => true
    Name == "Gray, James"          # => true
    
    !!(Name =~ /\A(?:Gray, James|Gray, Dana)\z/)  # => true
    ["Gray, James", "Gray, Dana"].include?(Name)  # => true
    
    # --- Extraction:  String#[] accepts a regex plus an optional capture ---
    
    # `$&` is the text of the last successful match
    Name =~ /\A\w+/ && $&  # => "Gray"
    Name[/\A\w+/]          # => "Gray"
    
    Name =~ /\A(\w+),\s*(\w+)\z/ && $2  # => "James"
    Name[/\A(\w+),\s*(\w+)\z/, 2]       # => "James"
    
    # `$~` is the MatchData of the last match; named captures are read by Symbol
    Name =~ /\A(?<last>\w+),\s*(?<first>\w+)\z/ && $~[:first]  # => "James"
    Name[/\A(?<last>\w+),\s*(?<first>\w+)\z/, :first]          # => "James"
    
    # --- Splitting into lines and characters ---
    
    Name.scan(/^.*\n?/)  # => ["Gray, James"]
    Name.lines           # => ["Gray, James"]
    
    # the /m flag lets `.` match newlines too, so no character is skipped
    Name.scan(/./m)  # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]
    Name.chars       # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]
    
    # --- Removing and translating characters ---
    
    Name.gsub(/[aeiou]/, "")  # => "Gry, Jms"
    Name.delete("aeiou")      # => "Gry, Jms"
    
    # every vowel ends up as "X" in both forms, as the results show
    Name.gsub(/[aeiou]/, "X") # => "GrXy, JXmXs"
    Name.tr("aeiou", "X")     # => "GrXy, JXmXs"
    
    # --- Insertion and replacement ---
    #
    # The non-regex halves below call `dup()` first because insert(), prepend(),
    # replace(), and clear() modify their receiver, while sub() returns a new
    # String.  If you want the in-place change you can drop the `dup()` and
    # switch `sub()` to `sub!()`, as long as you don't care about the return value.
    
    # the lookahead matches the empty spot just before the comma
    Name.sub(/(?=,)/, " II")                 # => "Gray II, James"
    Name.dup.insert(Name.index(","), " II")  # => "Gray II, James"
    
    Name.sub(/\A/, "Name:  ")    # => "Name:  Gray, James"
    Name.dup.prepend("Name:  ")  # => "Name:  Gray, James"
    
    Name.sub(/\A.*\z/m, "Gray, Dana")  # => "Gray, Dana"
    Name.dup.replace("Gray, Dana")     # => "Gray, Dana"
    
    Name.sub(/\A.*\z/m, "")  # => ""
    Name.dup.clear           # => ""
    
    
    
    # Whitespace handling:  as in the pairs for Name, each regex-based line is
    # followed by the String method that returns the same value.
    Spacey = "\tsome    space\r\n"
    
    # leading whitespace only
    Spacey.sub(/\A\s+/, "")  # => "some    space\r\n"
    Spacey.lstrip            # => "some    space\r\n"
    
    # trailing whitespace only
    Spacey.sub(/\s+\z/, "")  # => "\tsome    space"
    Spacey.rstrip            # => "\tsome    space"
    
    # both ends; the non-greedy (.+?) leaves the trailing whitespace for \s*
    Spacey.sub(/\A\s*(.+?)\s*\z/m, '\1')  # => "some    space"
    Spacey.strip                          # => "some    space"
    
    # one trailing line ending:  "\r\n", "\n", or "\r"
    Spacey.sub(/(?:\r?\n|\r)\z/m, "")  # => "\tsome    space"
    Spacey.chomp                       # => "\tsome    space"
    
    # the last character, with a trailing "\r\n" counted as a single unit
    Spacey.sub(/(?:\r\n|.)\z/m, "")  # => "\tsome    space"
    Spacey.chop                      # => "\tsome    space"
    
    # collapse runs of spaces (tabs and line endings are untouched)
    Spacey.gsub(/ +/, " ")  # => "\tsome space\r\n"
    Spacey.squeeze(" ")     # => "\tsome space\r\n"
    
  • 22

    SEP
    2014

    A Regex Can't Match Balanced Parentheses

    Can we do math with regular expressions?

    #!/usr/bin/env ruby -w
    
    # Builds a regex that finds one `term operator term` sequence which is not
    # already directly followed by a closing parenthesis, so a caller can wrap
    # it in parentheses.
    #
    # number_regex - a Regexp matching a single number
    # ops          - an Array of one-character operator Strings; each is escaped
    #                and placed in a character class
    #
    # The named groups are defined with a `{0}` quantifier, so they match nothing
    # where they are written and only serve as subroutines for the `\g<...>`
    # calls.  The trailing lookahead accepts the match at the end of the input or
    # when the next character is anything other than `)`.
    def build_preparation_regex(number_regex, ops)
      operator_chars = ops.map { |op| Regexp.escape(op) }.join
      %r{
        (?<number>             #{number_regex}                                   ){0}
        (?<operator>           [#{operator_chars}]        ){0}
        (?<term_operator_term> \g<term> \s* \g<operator> \s* \g<term>            ){0}
        (?<term>               \g<number> | \( \s* \g<term_operator_term> \s* \) ){0}
    
        \g<term_operator_term>(?=\s*\z|[^)])
      }x
    end
    
    # A single number:  optionally negative, with an optional fractional part.
    NUMBER_REGEX               = %r{
      -?            # an optional minus
      \d+           # an integer
      (?: \. \d+)?  # an optional fractional bit
    }x
    # Preparation passes.  The first only recognizes `*` and `/` between terms;
    # the second recognizes all four operators.  Both come from
    # build_preparation_regex(), so each finds a `term operator term` sequence
    # that is not yet directly followed by `)`.
    PREPARE_MULT_AND_DIV_REGEX = build_preparation_regex(NUMBER_REGEX, %w[* /])
    PREPARE_ADD_AND_SUB_REGEX  = build_preparation_regex(NUMBER_REGEX, %w[* / + -])
    # Validates a whole expression (anchored with \A and \z):  it must be either
    # a bare number or a parenthesized `term operator term`, where each term is
    # recursively one of those two forms.
    CHECK_REGEX                = %r{
      \A                   # the start of the expression
      (?<term>             # a term, which is:
        #{NUMBER_REGEX}    # a number
        |                  # or
        \( \s*             # a parenthesized group of
          \g<term>         # a term
          \s* [*/+\-] \s*  # an operator
          \g<term>         # and another term
        \s* \)             # the end of the parenthesized group
      )
      \z                   # the end of the expression
    }x
    # Matches one parenthesized `number operator number` group (no nested
    # parentheses inside it) and captures the pieces as `left`, `operator`, and
    # `right`.
    MATH_REGEX                 = %r{
      \( \s*
      (?<left>     #{NUMBER_REGEX} )
      \s*
      (?<operator> [*/+\-]         )
      \s*
      (?<right>    #{NUMBER_REGEX} )
      \s* \)
    }x
    
    # Command-line driver.
    #
    # Usage:  PROGRAM [-v] MATH_EXPRESSION
    #
    # With -v, every intermediate form of the expression is printed before the
    # final answer.
    verbose = ARGV.delete("-v")
    # Check for a missing argument *before* calling strip().  Calling strip() on
    # ARGV.first directly raises NoMethodError when no argument is given, so the
    # usage message could never be shown.
    problem = ARGV.first or abort "USAGE:  #{$PROGRAM_NAME} MATH_EXPRESSION"
    problem = problem.strip
    steps   = [ ]
    
    # Phase 1:  add explicit parentheses.  The multiplication/division pass runs
    # to exhaustion before the pass that knows all four operators, and each pass
    # wraps one `term operator term` at a time until nothing is left to wrap.
    [PREPARE_MULT_AND_DIV_REGEX, PREPARE_ADD_AND_SUB_REGEX].each do |preparation|
      loop do
        steps << problem.dup if verbose
        # sub!() returns nil when nothing was replaced, which ends the loop
        problem.sub!(preparation) { |term| "(#{term})" } or break
      end
    end
    
    # Phase 2:  reject anything that is not a number or a fully parenthesized
    # expression after preparation.
    problem =~ CHECK_REGEX or abort "Error:  Invalid expression"
    
    # Phase 3:  repeatedly replace a parenthesized `number operator number`
    # group with its value until none remain.  All arithmetic is done on Floats.
    solution = problem.dup
    loop do
      steps << solution.dup if verbose
      solution.sub!(MATH_REGEX) {
        $~[:left].to_f.public_send($~[:operator], $~[:right].to_f)
      } or break
    end
    
    # Each loop records the unchanged String once more on its last iteration, so
    # uniq() removes those repeats; the final entry is the answer itself, which
    # is printed separately below.
    puts steps.uniq[0..-2] if verbose
    # Drop a trailing ".0" so whole-number results print as integers.
    puts solution.sub(/\.0+\z/, "")
    

    Read more…

  • 20

    SEP
    2014

    Can You snake_case/CamelCase With One Regex?

    In Rails, methods like underscore() and camelize() use several regexen to transform the String under the hood. Many people have asked if you can do it with a single regex though. These specs I borrowed from Rails seem to say yes:

    #!/usr/bin/env ruby -w
    
    # Case-conversion helpers added to String.  All three conversions lean on an
    # acronym manager:  any object responding to regex(), inverted_regex(), and
    # capitalize().
    class String
      # Returns a lowercase, underscore-separated version of a CamelCase String.
      # `::` becomes `/`.
      #
      # acronyms - the acronym manager to consult (defaults to the shared one)
      def snake_case(acronyms = self.class.acronyms)
        pattern = %r{
          (?:
            (?<before>  \b | [A-Za-z\d]   )
            (?<acronym> #{acronyms.regex} )
            (?<after>   \b | [^a-z]       )
          )
          |
          (?: (?<before> [A-Z]+ ) (?<after> [A-Z][^A-Z] ) )
          |
          (?: (?<before> [^A-Z:] ) (?<after> [A-Z] ) )
          |
          (?<nesting> :: )
        }x
    
        underscored = gsub(pattern) do
          match = Regexp.last_match
          next "/" if match[:nesting]
    
          # whichever of the three pieces actually matched get joined by "_"
          [:before, :acronym, :after]
            .map    { |name|  match[name]                  }
            .reject { |piece| piece.nil? || piece.empty?   }
            .join("_")
        end
        underscored.downcase
      end
    
      # Returns a CamelCase version of a snake_case String.  `/` becomes `::`.
      #
      # acronyms - the acronym manager to consult (defaults to the shared one)
      def CamelCase(acronyms = self.class.acronyms)
        pattern = %r{
          (?:
            (?: \A | _ | (?<nesting> / ) )
            (?<acronym> #{acronyms.inverted_regex} )
            (?= \b | [A-Z_] )
          )
          |
          (?: (?: \A | _ ) (?<letter> . ) )
          |
          (?: (?<nesting> / ) (?<letter> . ) )
        }mx
    
        gsub(pattern) do
          match  = Regexp.last_match
          prefix = match[:nesting] && "::"
          # the block is only a fallback:  it supplies the upcased letter when
          # the manager has no entry for the matched acronym
          word   = acronyms.capitalize(match[:acronym]) { match[:letter].upcase }
          "#{prefix}#{word}"
        end
      end
    
      # Same as CamelCase() with the default acronyms, but with a leading
      # uppercase ASCII letter lowercased.
      def camelCase
        camel = self.CamelCase
        camel.sub(/\A[A-Z]/, &:downcase)
      end
    
      # The acronym manager shared by default; created on first use.
      def self.acronyms
        @acronyms ||= AcronymManager.new
      end
    end
    
    # Keeps a list of acronyms and builds the regexen used to recognize them in
    # both their proper form ("HTML") and their lowercased form ("html").
    class AcronymManager
      # `\z` followed by a literal "A" cannot match, so this stands in for "no
      # acronyms" wherever a regex is required.
      NEVER_MATCHES = /\zA/
    
      def initialize
        @acronyms = { }  # proper form => lowercased form
        @inverted = { }  # lowercased form => proper form
      end
    
      attr_reader :acronyms, :inverted
      private     :acronyms, :inverted
    
      # Registers an acronym in its proper capitalization and rebuilds the
      # reverse lookup.  Returns the rebuilt reverse lookup Hash.
      def add(acronym)
        acronyms.store(acronym, acronym.downcase)
        @inverted = acronyms.invert
      end
    
      # A regex matching any registered acronym in its proper form.
      def regex
        alternation_of(acronyms.keys)
      end
    
      # A regex matching any registered acronym in its lowercased form.
      def inverted_regex
        alternation_of(inverted.keys)
      end
    
      # Looks up the proper form for a lowercased acronym.  As with Hash#fetch,
      # the block supplies the result when the acronym is unknown.
      def capitalize(acronym, &default)
        inverted.fetch(acronym, &default)
      end
    
      private
    
      # Builds a non-capturing alternation of the escaped words, or the
      # never-matching regex when there are none.
      def alternation_of(words)
        return NEVER_MATCHES if words.empty?
    
        escaped = words.map { |word| Regexp.escape(word) }
        /(?:#{escaped.join('|')})/
      end
    end
    
    # Specs run only when this file is executed directly.
    #
    # Values under test are wrapped in `_()` before calling must_equal().
    # Calling must_equal() directly on an arbitrary object is the deprecated
    # "global expectation" form in newer Minitest releases.  Note that `_()`
    # requires Minitest 5.6 or later.
    if $PROGRAM_NAME == __FILE__
      require "minitest/autorun"
    
      describe "Case changing" do
        # Each Hash below maps a CamelCase String to its snake_case counterpart.
    
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test_cases.rb#L118-L123
        let(:examples) {
          {
            "Product"               => "product",
            "SpecialGuest"          => "special_guest",
            "ApplicationController" => "application_controller",
            "Area51Controller"      => "area51_controller",
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test_cases.rb#L139-L145
        let(:one_way_snake_examples) {
          {
            "HTMLTidy"              => "html_tidy",
            "HTMLTidyGenerator"     => "html_tidy_generator",
            "FreeBSD"               => "free_bsd",
            "HTML"                  => "html",
            "ForceXMLController"    => "force_xml_controller"
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test.rb#L98
        let(:one_way_camel_examples) {
          {
            "CamelCase"             => "Camel_Case"
          }
        }
        # added by James
        let(:path_examples) {
          {
            "SomeLib::WithClass"    => "some_lib/with_class"
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test.rb#L101-L145
        let(:acronym_examples) {
          {
            "API"                   => "api",
            "APIController"         => "api_controller",
            "Nokogiri::HTML"        => "nokogiri/html",
            "HTTPAPI"               => "http_api",
            "HTTP::Get"             => "http/get",
            "SSLError"              => "ssl_error",
            "RESTful"               => "restful",
            "RESTfulController"     => "restful_controller",
            "Nested::RESTful"       => "nested/restful",
            "IHeartW3C"             => "i_heart_w3c",
            "PhDRequired"           => "phd_required",
            "IRoRU"                 => "i_ror_u",
            "RESTfulHTTPAPI"        => "restful_http_api",
    
            # misdirection
            "Capistrano"            => "capistrano",
            "CapiController"        => "capi_controller",
            "HttpsApis"             => "https_apis",
            "Html5"                 => "html5",
            "Restfully"             => "restfully",
            "RoRails"               => "ro_rails"
          }
        }
    
        it "can snake_case a String" do
          examples.each do |camel, snake|
            _(camel.snake_case).must_equal(snake)
          end
        end
    
        it "can handle some tricky one-way cases for snake_case" do
          one_way_snake_examples.each do |camel, snake|
            _(camel.snake_case).must_equal(snake)
          end
        end
    
        it "can CamelCase a String" do
          examples.each do |camel, snake|
            _(snake.CamelCase).must_equal(camel)
          end
        end
    
        it "can handle some tricky one-way cases for CamelCase" do
          one_way_camel_examples.each do |camel, snakey|
            _(snakey.CamelCase).must_equal(camel)
          end
        end
    
        it "can camelCase a String" do
          _("camel_case".camelCase).must_equal("camelCase")
        end
    
        it "can convert nesting to paths and back" do
          path_examples.each do |camel, snake|
            _(camel.snake_case).must_equal(snake)
            _(snake.CamelCase).must_equal(camel)
          end
        end
    
        it "is aware of acronyms" do
          # a private manager keeps these acronyms out of the shared default
          acronyms = AcronymManager.new
          acronyms.add("API")
          acronyms.add("HTML")
          acronyms.add("HTTP")
          acronyms.add("RESTful")
          acronyms.add("W3C")
          acronyms.add("PhD")
          acronyms.add("RoR")
          acronyms.add("SSL")
    
          acronym_examples.each do |camel, snake|
            _(camel.snake_case(acronyms)).must_equal(snake)
            _(snake.CamelCase(acronyms)).must_equal(camel)
          end
        end
      end
    end
    

    Read more…

  • 19

    SEP
    2014

    "You can't parse [X]HTML with regex."

    The only explanation I'll give for the following code is to provide this link to my favorite Stack Overflow answer.

    #!/usr/bin/env ruby -w
    
    require "open-uri"
    
    # The page to download and parse.
    URL    = "http://stackoverflow.com/questions/1732348/" +
             "regex-match-open-tags-except-xhtml-self-contained-tags"
    # A one-token-at-a-time tokenizer for HTML.
    #
    # Every named group is defined with a `{0}` quantifier, so it matches nothing
    # where it is written and only acts as a subroutine for the `\g<...>` calls
    # in the final alternation.  That alternation is anchored with `\G`, consumes
    # exactly one token plus any whitespace after it, and leaves the name of the
    # token kind (and its pieces, such as `tag_name` or `attribute_name`) set in
    # the resulting MatchData.
    #
    # Flags:  `m` lets `.` cross newlines, `i` ignores case, `x` allows the
    # layout and comments used here.
    #
    # The comment rule is non-greedy.  With a greedy `.*` (and `.` matching
    # newlines) a comment would run to the *last* `-->` in the input, silently
    # swallowing all markup between the first and last comments.
    PARSER = %r{
      (?<doctype_declaration>
        <!DOCTYPE\b (?<doctype> [^>]* ) >
      ){0}
      (?<comment>
        <!-- .*? -->  # non-greedy:  stop at the first closing marker
      ){0}
    
      (?<script_tag>
        < \s* (?<tag_name> script ) \s* (?<attributes> [^>]* > )
          (?<script> .*? )
        < \s* / \s* script \s* >
      ){0}
      (?<self_closed_tag>
        < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* / \s* > )
      ){0}
      (?<unclosed_tag>
        < \s*
        (?<tag_name> link | meta | br | input | hr | img ) \b
        \s*
        (?<attributes> [^>]* > )
      ){0}
      (?<open_tag>
        < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* > )
      ){0}
      (?<close_tag>
        < \s* / \s* (?<tag_name> \w+ ) \s* >
      ){0}
    
      (?<attribute>
        (?<attribute_name> [-\w]+ )
        (?: \s* = \s* (?<attribute_value> "[^"]*" | '[^']*' | [^>\s]+ ) )? \s*
      ){0}
      (?<attribute_list>
        \g<attribute>
        (?= [^>]* > \z )  # attributes keep a trailing > to disambiguate from text
      ){0}
    
      (?<text>
        (?! [^<]* /?\s*> \z )  # a guard to prevent this from parsing attributes
        [^<]+
      ){0}
    
      \G
      (?:
        \g<doctype_declaration>
        |
        \g<comment>
        |
        \g<script_tag>
        |
        \g<self_closed_tag>
        |
        \g<unclosed_tag>
        |
        \g<open_tag>
        |
        \g<attribute_list>
        |
        \g<close_tag>
        |
        \g<text>
      )
      \s*
    }mix
    
    # Parses an HTML String into a tree of nested Hashes and returns the root.
    #
    # html - the HTML source; it is left untouched (parsing consumes a copy, so
    #        frozen Strings are fine too)
    #
    # Every node is a Hash with :attributes, :contents, and :name keys, plus one
    # String key per child tag name (see add_to_tree()).  The root node is named
    # :root.  Parsing stops at the first position PARSER cannot match.
    #
    # Malformed nesting is tolerated:  a closing tag with nothing open is
    # ignored rather than popping the root, and tags still open at the end of
    # the input do not change the return value, which is always the root node.
    def parse(html)
      remaining = html.dup
      root      = {attributes: [ ], contents: [ ], name: :root}
      stack     = [root]  # the chain of currently open nodes, innermost last
      loop do
        # remove one token from the front; nil means nothing more can be parsed
        remaining.sub!(PARSER, "") or break
        if $~[:doctype_declaration]
          add_to_tree(stack.last, "DOCTYPE", $~[:doctype].strip)
        elsif $~[:script_tag]
          add_to_stack(stack, $~[:tag_name], $~[:attributes], $~[:script])
        elsif $~[:self_closed_tag] || $~[:unclosed_tag] || $~[:open_tag]
          # only a true open tag (non-nil last argument) stays on the stack
          add_to_stack(stack, $~[:tag_name], $~[:attributes], "", $~[:open_tag])
        elsif $~[:close_tag]
          stack.pop if stack.size > 1
        elsif $~[:text]
          stack.last[:contents] << $~[:text]
        end
        # any other token kind (a comment, for example) is simply discarded
      end
      root
    end
    
    # Stores value under name in the branch Hash.
    #
    # The first value for a name is stored as is.  When the name is already
    # present, the existing entry is turned into an Array (unless it already is
    # one) and the new value is appended to it.
    #
    # Returns the value on first insertion, otherwise the Array of values.
    def add_to_tree(branch, name, value)
      return branch[name] = value unless branch.key?(name)
    
      existing     = branch[name]
      branch[name] = [existing] unless existing.is_a?(Array)
      branch[name] << value
    end
    
    # Builds a node for a tag and attaches it to the innermost open node.
    #
    # stack           - the Array of open nodes, innermost last
    # tag_name        - the tag's name
    # attributes_html - the raw attribute text, handed to parse_attributes()
    # contents        - initial text content; an empty String means none
    # open            - when truthy, the new node is also pushed onto the stack
    #                   so later tokens nest inside it
    #
    # The node is registered twice on its parent:  under its tag name (via
    # add_to_tree()) and in the parent's ordered :contents list.
    def add_to_stack(stack, tag_name, attributes_html, contents, open = false)
      parent = stack.last
      node   = {
        attributes: parse_attributes(attributes_html),
        contents:   contents.empty? ? [ ] : [contents],
        name:       tag_name
      }
      add_to_tree(parent, tag_name, node)
      parent[:contents] << node
      stack.push(node) if open
    end
    
    # Turns the raw attribute text of a tag into a Hash of name => value.
    #
    # attributes_html - the attribute text; it is consumed in place, one PARSER
    #                   match at a time, until PARSER no longer matches
    #
    # An attribute without a value uses its own name as the value.  One pair of
    # matching surrounding quotes is removed from a value.  Repeated names are
    # collected by add_to_tree().
    def parse_attributes(attributes_html)
      parsed = { }
      while attributes_html.sub!(PARSER, "")
        match = $~  # captured now, because the sub() below performs a new match
        raw   = match[:attribute_value] || match[:attribute_name]
        add_to_tree(parsed, match[:attribute_name], raw.sub(/\A(["'])(.*)\1\z/, '\2'))
      end
      parsed
    end
    
    # Renders a parsed node as BBCode.
    #
    # A Hash node becomes "[name]...[/name]" around its recursively converted
    # :contents, with the tag name "strike" shortened to "s".  Anything that is
    # not a Hash is returned unchanged.
    def convert_to_bbcode(node)
      return node unless node.is_a?(Hash)
    
      tag   = node[:name].sub(/\Astrike\z/, "s")
      inner = node[:contents].map { |child| convert_to_bbcode(child) }.join
      "[#{tag}]#{inner}[/#{tag}]"
    end
    
    # Download the page, parse it, and print the first paragraph of one answer
    # as BBCode.
    #
    # The page is opened through the URI object that open-uri extends.  A bare
    # `open(URL)` only reaches the network on Rubies where open-uri still
    # patches Kernel#open; on Ruby 3.0 and later it treats the URL as a local
    # file name and fails.
    html = URI.parse(URL).open(&:read).strip
    ast  = parse(html)
    # NOTE(review): this chain walks a hard-coded path of ids and classes, so it
    # presumably depends on the page markup at the time of writing -- confirm
    # against the live page before relying on it.
    puts ast["html"]["body"]["div"]
      .find { |div| div[:attributes]["class"] == "container"      }["div"]
      .find { |div| div[:attributes]["id"]    == "content"        }["div"]["div"]
      .find { |div| div[:attributes]["id"]    == "mainbar"        }["div"]
      .find { |div| div[:attributes]["id"]    == "answers"        }["div"]
      .find { |div| div[:attributes]["id"]    == "answer-1732454" }["table"]["tr"]
      .first["td"]
      .find { |div| div[:attributes]["class"] == "answercell"     }["div"]["p"]
      .first[:contents]
      .map(&method(:convert_to_bbcode))  # to reach a wider audience
      .join
    

    Read more…