Gray Soft / Deadly Regular Expressions

25

SEP
2014

Regex Code Equivalency

#!/usr/bin/env ruby -w

# Regex-vs-String-method equivalences on a simple name.
#
# Each pair below shows a regex-based expression followed by a String (or
# Array) method call that yields the same value.  The trailing `# =>`
# annotations record the value each expression evaluates to.
Name = "Gray, James"

# Anchored prefix and suffix tests.
!!(Name =~ /\AGray/)      # => true
Name.start_with?("Gray")  # => true

!!(Name =~ /James\z/)    # => true
Name.end_with?("James")  # => true

# Substring, emptiness, and whole-string equality tests.
!!(Name =~ /Dana/)     # => false
Name.include?("Dana")  # => false

!!(Name =~ /\A\z/)  # => false
Name.empty?         # => false

!!(Name =~ /\AGray, James\z/)  # => true
Name == "Gray, James"          # => true

# Membership in a fixed set of whole-string alternatives.
!!(Name =~ /\A(?:Gray, James|Gray, Dana)\z/)  # => true
["Gray, James", "Gray, Dana"].include?(Name)  # => true

# Extraction:  `$&`, `$2`, and `$~` read the last match, while `String#[]`
# takes the regex (and optionally a group number or name) directly.
Name =~ /\A\w+/ && $&  # => "Gray"
Name[/\A\w+/]          # => "Gray"

Name =~ /\A(\w+),\s*(\w+)\z/ && $2  # => "James"
Name[/\A(\w+),\s*(\w+)\z/, 2]       # => "James"

Name =~ /\A(?<last>\w+),\s*(?<first>\w+)\z/ && $~[:first]  # => "James"
Name[/\A(?<last>\w+),\s*(?<first>\w+)\z/, :first]          # => "James"

# Splitting into lines and characters.
Name.scan(/^.*\n?/)  # => ["Gray, James"]
Name.lines           # => ["Gray, James"]

Name.scan(/./m)  # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]
Name.chars       # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]

# Deleting or translating a set of characters.
Name.gsub(/[aeiou]/, "")  # => "Gry, Jms"
Name.delete("aeiou")      # => "Gry, Jms"

Name.gsub(/[aeiou]/, "X") # => "GrXy, JXmXs"
Name.tr("aeiou", "X")     # => "GrXy, JXmXs"

# For the destructive operations that follow you can drop the `dup()` and
# switch `sub()` to `sub!()`, as long as you don't care about the return value.

# Inserting before the comma (the lookahead matches the empty spot before it).
Name.sub(/(?=,)/, " II")                 # => "Gray II, James"
Name.dup.insert(Name.index(","), " II")  # => "Gray II, James"

# Prepending, replacing the whole content, and clearing.
Name.sub(/\A/, "Name:  ")    # => "Name:  Gray, James"
Name.dup.prepend("Name:  ")  # => "Name:  Gray, James"

Name.sub(/\A.*\z/m, "Gray, Dana")  # => "Gray, Dana"
Name.dup.replace("Gray, Dana")     # => "Gray, Dana"

Name.sub(/\A.*\z/m, "")  # => ""
Name.dup.clear           # => ""



# Whitespace-handling equivalences.  The sample has a leading tab, a run of
# spaces in the middle, and a trailing "\r\n".  As above, each regex-based
# expression is followed by the String method producing the same value.
Spacey = "\tsome    space\r\n"

# Trimming leading, trailing, and surrounding whitespace.
Spacey.sub(/\A\s+/, "")  # => "some    space\r\n"
Spacey.lstrip            # => "some    space\r\n"

Spacey.sub(/\s+\z/, "")  # => "\tsome    space"
Spacey.rstrip            # => "\tsome    space"

Spacey.sub(/\A\s*(.+?)\s*\z/m, '\1')  # => "some    space"
Spacey.strip                          # => "some    space"

# Removing one trailing line ending (chomp) or one trailing character, with
# "\r\n" counted as a single unit (chop).
Spacey.sub(/(?:\r?\n|\r)\z/m, "")  # => "\tsome    space"
Spacey.chomp                       # => "\tsome    space"

Spacey.sub(/(?:\r\n|.)\z/m, "")  # => "\tsome    space"
Spacey.chop                      # => "\tsome    space"

# Collapsing runs of spaces into a single space.
Spacey.gsub(/ +/, " ")  # => "\tsome space\r\n"
Spacey.squeeze(" ")     # => "\tsome space\r\n"

In: Deadly Regular Expressions | Tags: Regular Expression, Style & Syntax | 2 Comments

22

SEP
2014

A Regex Can't Match Balanced Parentheses

Can we do math with regular expressions?

#!/usr/bin/env ruby -w

# Builds the regex used to add explicit parentheses around one
# "term operator term" group at a time.
#
# number_regex - a Regexp matching a single number
# ops          - an Array of operator Strings allowed between the two terms
#
# A term is either a number or an already parenthesized
# "term operator term" group.  The trailing lookahead rejects a match that is
# immediately followed by ")", which is how an already wrapped group is left
# alone.
#
# Returns a Regexp.
def build_preparation_regex(number_regex, ops)
  # Each operator is escaped so it is literal inside the character class.
  operator_chars = ops.map { |op| Regexp.escape(op) }.join

  %r{
    (?<number>             #{number_regex}                                   ){0}
    (?<operator>           [#{operator_chars}]        ){0}
    (?<term_operator_term> \g<term> \s* \g<operator> \s* \g<term>            ){0}
    (?<term>               \g<number> | \( \s* \g<term_operator_term> \s* \) ){0}

    \g<term_operator_term>(?=\s*\z|[^)])
  }x
end

# A single number:  an integer or decimal with an optional leading minus.
NUMBER_REGEX               = %r{
  -?            # an optional minus
  \d+           # an integer
  (?: \. \d+)?  # an optional fractional bit
}x
# Preparation passes.  The first only recognizes `*` and `/` between terms;
# the second recognizes all four operators.
PREPARE_MULT_AND_DIV_REGEX = build_preparation_regex(NUMBER_REGEX, %w[* /])
PREPARE_ADD_AND_SUB_REGEX  = build_preparation_regex(NUMBER_REGEX, %w[* / + -])
# Validates a whole expression:  it must be exactly one term, where a term is
# a number or a parenthesized "term operator term" group (matched recursively).
CHECK_REGEX                = %r{
  \A                   # the start of the expression
  (?<term>             # a term, which is:
    #{NUMBER_REGEX}    # a number
    |                  # or
    \( \s*             # a parenthesized group of
      \g<term>         # a term
      \s* [*/+\-] \s*  # an operator
      \g<term>         # and another term
    \s* \)             # the end of the parenthesized group
  )
  \z                   # the end of the expression
}x
# Matches one parenthesized "number operator number" group and captures its
# parts as `left`, `operator`, and `right`.
MATH_REGEX                 = %r{
  \( \s*
  (?<left>     #{NUMBER_REGEX} )
  \s*
  (?<operator> [*/+\-]         )
  \s*
  (?<right>    #{NUMBER_REGEX} )
  \s* \)
}x

# Command-line driver.
#
# Usage:  pass the math expression as the first argument; add `-v` to also
# print the intermediate steps.

verbose = ARGV.delete("-v")
# Check for the argument *before* calling strip() on it.  Stripping first
# would raise NoMethodError on nil and the usage message would never be shown.
problem = ARGV.first or abort "USAGE:  #{$PROGRAM_NAME} MATH_EXPRESSION"
problem = problem.strip
steps   = [ ]

# Wrap each "term operator term" group in parentheses, one substitution at a
# time:  first with the `*` and `/` regex, then with the all-operators regex.
# Each loop ends when sub!() finds nothing left to wrap and returns nil.
[PREPARE_MULT_AND_DIV_REGEX, PREPARE_ADD_AND_SUB_REGEX].each do |preparation|
  loop do
    steps << problem.dup if verbose
    problem.sub!(preparation) { |term| "(#{term})" } or break
  end
end

problem =~ CHECK_REGEX or abort "Error:  Invalid expression"

# Repeatedly replace a parenthesized "number operator number" group with its
# value until no such group is left.
solution = problem.dup
loop do
  steps << solution.dup if verbose
  solution.sub!(MATH_REGEX) {
    $~[:left].to_f.public_send($~[:operator], $~[:right].to_f)
  } or break
end

# Each loop records the state once more before breaking, so duplicates are
# dropped, and the last entry is the solution itself, which is printed below.
puts steps.uniq[0..-2] if verbose
# All math is done with Floats; hide a trailing ".0" on whole results.
puts solution.sub(/\.0+\z/, "")

20

SEP
2014

Can You snake_case/CamelCase With One Regex?

In Rails, methods like underscore() and camelize() use several regexen to transform the String under the hood. Many people have asked if you can do it with a single regex though. These specs I borrowed from Rails seem to say yes:

#!/usr/bin/env ruby -w

class String
  # Returns a snake_case copy of a CamelCase String.
  #
  # acronyms - an object whose regex() method returns a Regexp matching the
  #            known acronyms (defaults to the manager shared by this class)
  #
  # A single gsub() finds every spot that needs an underscore (around an
  # acronym, between a run of capitals and a following capitalized word, or
  # between a non-capital and a capital) and turns "::" into "/".  The result
  # is then downcased.
  def snake_case(acronyms = self.class.acronyms)
    boundaries = %r{
      (?:
        (?<before>  \b | [A-Za-z\d]   )
        (?<acronym> #{acronyms.regex} )
        (?<after>   \b | [^a-z]       )
      )
      |
      (?: (?<before> [A-Z]+ ) (?<after> [A-Z][^A-Z] ) )
      |
      (?: (?<before> [^A-Z:] ) (?<after> [A-Z] ) )
      |
      (?<nesting> :: )
    }x

    separated = gsub(boundaries) do
      found = Regexp.last_match
      next "/" if found[:nesting]

      # Groups that did not take part are nil and a bare `\b` captures "";
      # neither should contribute an underscore.
      [found[:before], found[:acronym], found[:after]]
        .select { |piece| piece && !piece.empty? }
        .join("_")
    end
    separated.downcase
  end

  # Returns a CamelCase copy of a snake_case String.
  #
  # acronyms - an object responding to inverted_regex() and capitalize()
  #            (defaults to the manager shared by this class)
  #
  # A single gsub() handles the start of the String and every "_" or "/"
  # separator.  A "/" becomes "::", and the acronym or letter that follows is
  # capitalized by asking `acronyms`, falling back to upcasing the letter.
  def CamelCase(acronyms = self.class.acronyms)
    word_starts = %r{
      (?:
        (?: \A | _ | (?<nesting> / ) )
        (?<acronym> #{acronyms.inverted_regex} )
        (?= \b | [A-Z_] )
      )
      |
      (?: (?: \A | _ ) (?<letter> . ) )
      |
      (?: (?<nesting> / ) (?<letter> . ) )
    }mx

    gsub(word_starts) do
      found     = Regexp.last_match
      separator = "::" if found[:nesting]
      word      = acronyms.capitalize(found[:acronym]) { found[:letter].upcase }
      "#{separator}#{word}"
    end
  end

  # Like CamelCase(), but with a leading capital letter downcased.
  def camelCase
    camelized = self.CamelCase
    camelized.sub(/\A[A-Z]/, &:downcase)
  end

  class << self
    # The AcronymManager used when a conversion is not given one explicitly;
    # it is created on first use and then reused.
    def acronyms
      @acronyms ||= AcronymManager.new
    end
  end
end

# Keeps a list of acronyms and builds the regexen the case conversions
# interpolate to recognize them.
class AcronymManager
  # End-of-string followed by a literal "A":  a pattern that cannot match,
  # returned while no acronyms are registered.
  NEVER_MATCHES = /\zA/

  def initialize
    @acronyms = { }  # acronym            => downcased acronym
    @inverted = { }  # downcased acronym  => acronym
  end

  attr_reader :acronyms, :inverted
  private     :acronyms, :inverted

  # Registers an acronym and rebuilds the downcased-to-acronym lookup, which
  # is also the return value.
  def add(acronym)
    acronyms.store(acronym, acronym.downcase)
    @inverted = acronyms.invert
  end

  # A Regexp matching any registered acronym as written.
  def regex
    alternation(acronyms.keys)
  end

  # A Regexp matching the downcased form of any registered acronym.
  def inverted_regex
    alternation(inverted.keys)
  end

  # Returns the registered spelling for a downcased acronym.  When there is
  # none, the result of the given block is returned instead.
  def capitalize(acronym, &default)
    inverted.fetch(acronym, &default)
  end

  private

  # Joins the escaped words into one non-capturing alternation.
  def alternation(words)
    return NEVER_MATCHES if words.empty?

    escaped = words.map { |word| Regexp.escape(word) }
    /(?:#{escaped.join('|')})/
  end
end

# The specs only load and run when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  require "minitest/autorun"

  # Every example table below maps a CamelCase String (key) to its
  # snake_case form (value).
  describe "Case changing" do
    # Examples that convert cleanly in both directions.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L118-L123
    let(:examples) {
      {
        "Product"               => "product",
        "SpecialGuest"          => "special_guest",
        "ApplicationController" => "application_controller",
        "Area51Controller"      => "area51_controller",
      }
    }
    # Examples only checked in the CamelCase-to-snake_case direction.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L139-L145
    let(:one_way_snake_examples) {
      {
        "HTMLTidy"              => "html_tidy",
        "HTMLTidyGenerator"     => "html_tidy_generator",
        "FreeBSD"               => "free_bsd",
        "HTML"                  => "html",
        "ForceXMLController"    => "force_xml_controller"
      }
    }
    # Examples only checked in the direction of producing CamelCase.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L98
    let(:one_way_camel_examples) {
      {
        "CamelCase"             => "Camel_Case"
      }
    }
    # added by James
    let(:path_examples) {
      {
        "SomeLib::WithClass"    => "some_lib/with_class"
      }
    }
    # Examples that need the acronyms registered in the last spec below.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L101-L145
    let(:acronym_examples) {
      {
        "API"                   => "api",
        "APIController"         => "api_controller",
        "Nokogiri::HTML"        => "nokogiri/html",
        "HTTPAPI"               => "http_api",
        "HTTP::Get"             => "http/get",
        "SSLError"              => "ssl_error",
        "RESTful"               => "restful",
        "RESTfulController"     => "restful_controller",
        "Nested::RESTful"       => "nested/restful",
        "IHeartW3C"             => "i_heart_w3c",
        "PhDRequired"           => "phd_required",
        "IRoRU"                 => "i_ror_u",
        "RESTfulHTTPAPI"        => "restful_http_api",

        # misdirection
        "Capistrano"            => "capistrano",
        "CapiController"        => "capi_controller",
        "HttpsApis"             => "https_apis",
        "Html5"                 => "html5",
        "Restfully"             => "restfully",
        "RoRails"               => "ro_rails"
      }
    }

    it "can snake_case a String" do
      examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
      end
    end

    it "can handle some tricky one-way cases for snake_case" do
      one_way_snake_examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
      end
    end

    it "can CamelCase a String" do
      examples.each do |camel, snake|
        snake.CamelCase.must_equal(camel)
      end
    end

    it "can handle some tricky one-way cases for CamelCase" do
      one_way_camel_examples.each do |camel, snakey|
        snakey.CamelCase.must_equal(camel)
      end
    end

    it "can camelCase a String" do
      "camel_case".camelCase.must_equal("camelCase")
    end

    it "can convert nesting to paths and back" do
      path_examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
        snake.CamelCase.must_equal(camel)
      end
    end

    # Uses its own AcronymManager, passed explicitly to each conversion,
    # rather than the one shared through String.acronyms.
    it "is aware of acronyms" do
      acronyms = AcronymManager.new
      acronyms.add("API")
      acronyms.add("HTML")
      acronyms.add("HTTP")
      acronyms.add("RESTful")
      acronyms.add("W3C")
      acronyms.add("PhD")
      acronyms.add("RoR")
      acronyms.add("SSL")

      acronym_examples.each do |camel, snake|
        camel.snake_case(acronyms).must_equal(snake)
        snake.CamelCase(acronyms).must_equal(camel)
      end
    end
  end
end

19

SEP
2014

"You can't parse [X]HTML with regex."

The only explanation I'll give for the following code is to provide this link to my favorite Stack Overflow answer.

#!/usr/bin/env ruby -w

require "open-uri"

# The page that gets downloaded and parsed.
URL    = "http://stackoverflow.com/questions/1732348/" +
         "regex-match-open-tags-except-xhtml-self-contained-tags"
# Matches a single piece of HTML at the current match position (`\G`) plus any
# whitespace that follows it.  The `(?<name> ... ){0}` groups at the top only
# define named subpatterns; the alternation after `\G` is what actually
# matches, and the name of the group that took part tells the caller what kind
# of piece was found.
#
# The same regex is used on the attribute Strings captured from tags.  Those
# keep their trailing ">", which is what the `attribute_list` lookahead and
# the `text` guard key on.
PARSER = %r{
  (?<doctype_declaration>
    <!DOCTYPE\b (?<doctype> [^>]* ) >
  ){0}
  (?<comment>
    <!-- .*? -->  # non-greedy so a comment ends at the first closing marker
  ){0}

  (?<script_tag>
    < \s* (?<tag_name> script ) \s* (?<attributes> [^>]* > )
      (?<script> .*? )
    < \s* / \s* script \s* >
  ){0}
  (?<self_closed_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* / \s* > )
  ){0}
  (?<unclosed_tag>
    < \s*
    (?<tag_name> link | meta | br | input | hr | img ) \b
    \s*
    (?<attributes> [^>]* > )
  ){0}
  (?<open_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* > )
  ){0}
  (?<close_tag>
    < \s* / \s* (?<tag_name> \w+ ) \s* >
  ){0}

  (?<attribute>
    (?<attribute_name> [-\w]+ )
    (?: \s* = \s* (?<attribute_value> "[^"]*" | '[^']*' | [^>\s]+ ) )? \s*
  ){0}
  (?<attribute_list>
    \g<attribute>
    (?= [^>]* > \z )  # attributes keep a trailing > to disambiguate from text
  ){0}

  (?<text>
    (?! [^<]* /?\s*> \z )  # a guard to prevent this from parsing attributes
    [^<]+
  ){0}

  \G
  (?:
    \g<doctype_declaration>
    |
    \g<comment>
    |
    \g<script_tag>
    |
    \g<self_closed_tag>
    |
    \g<unclosed_tag>
    |
    \g<open_tag>
    |
    \g<attribute_list>
    |
    \g<close_tag>
    |
    \g<text>
  )
  \s*
}mix

# Parses a String of HTML into a tree of nested Hashes.
#
# html - the HTML to parse; it is copied first and left unmodified
#
# The copy is consumed from the front, one PARSER match at a time, until
# nothing more matches.  A stack tracks the currently open tags:  open tags
# push a node, close tags pop one, and text is appended to the contents of the
# innermost open node.  Matches that belong to none of the handled groups
# (comments, for example) are simply dropped.
#
# Returns the root node, a Hash with :attributes, :contents, and :name keys
# plus one String key per child tag name added by the helpers.
def parse(html)
  html  = html.dup  # sub!() below eats the String; don't eat the caller's
  stack = [{attributes: [ ], contents: [ ], name: :root}]
  loop do
    html.sub!(PARSER, "") or break
    if $~[:doctype_declaration]
      add_to_tree(stack.last, "DOCTYPE", $~[:doctype].strip)
    elsif $~[:script_tag]
      add_to_stack(stack, $~[:tag_name], $~[:attributes], $~[:script])
    elsif $~[:self_closed_tag] || $~[:unclosed_tag] || $~[:open_tag]
      # only a true open tag stays on the stack to collect children
      add_to_stack(stack, $~[:tag_name], $~[:attributes], "", $~[:open_tag])
    elsif $~[:close_tag]
      # never pop the root:  a stray close tag would otherwise empty the stack
      # and the next piece of text would fail on `stack.last` being nil
      stack.pop if stack.size > 1
    elsif $~[:text]
      stack.last[:contents] << $~[:text]
    end
  end
  # the root is always the bottom of the stack, even when tags were left open
  stack.first
end

# Stores value under name in branch, collecting repeats into an Array.
#
# The first value for a name is stored as is.  When the name is already
# present, the stored value is wrapped in an Array (unless it already is one)
# and the new value is appended to it.
#
# Returns the value when it is the first for the name, otherwise the Array of
# collected values.
def add_to_tree(branch, name, value)
  unless branch.key?(name)
    branch[name] = value
    return value
  end

  existing     = branch[name]
  branch[name] = [existing] unless existing.is_a?(Array)
  branch[name] << value
end

# Builds a node for a tag and attaches it to the innermost open node.
#
# stack           - the Array of currently open nodes, innermost last
# tag_name        - the name of the tag
# attributes_html - the raw attribute text, handed to parse_attributes()
# contents        - the tag's initial text content; "" for none
# open            - when truthy, the new node is pushed onto the stack
#
# The node is registered on its parent twice:  under its tag name (via
# add_to_tree()) and in the parent's ordered :contents list.
#
# Returns the stack when the node was pushed, nil otherwise.
def add_to_stack(stack, tag_name, attributes_html, contents, open = false)
  parent = stack.last
  node   = {
    attributes: parse_attributes(attributes_html),
    contents:   contents.empty? ? [ ] : [contents],
    name:       tag_name
  }

  add_to_tree(parent, tag_name, node)
  parent[:contents].push(node)
  stack.push(node) if open
end

# Turns the raw attribute text of a tag into a Hash of names to values.
#
# attributes_html - the attribute text; it is consumed in place, one PARSER
#                   match at a time, until nothing more matches
#
# An attribute without a value is given its own name as the value.  A value
# wrapped in matching single or double quotes has them removed.  Repeated
# names are collected by add_to_tree().
#
# Returns the Hash of attributes.
def parse_attributes(attributes_html)
  parsed = { }
  while attributes_html.sub!(PARSER, "")
    found = $~
    name  = found[:attribute_name]
    raw   = found[:attribute_value] || name
    add_to_tree(parsed, name, raw.sub(/\A(["'])(.*)\1\z/, '\2'))
  end
  parsed
end

# Renders a parsed node as BBCode.
#
# node - a node Hash (with :name and :contents) or anything else, typically a
#        String of text
#
# A Hash becomes "[name]...[/name]" around its recursively converted
# contents, with the tag name "strike" written as "s".  Attributes are not
# included.  Anything that is not a Hash is returned unchanged.
def convert_to_bbcode(node)
  return node unless node.is_a?(Hash)

  tag   = node[:name].sub(/\Astrike\z/, "s")
  inner = node[:contents].map { |child| convert_to_bbcode(child) }.join
  "[#{tag}]#{inner}[/#{tag}]"
end

# Download the page, parse it, and print the first paragraph of one specific
# answer as BBCode.
#
# The download goes through the URI object rather than Kernel#open():  opening
# a URL with Kernel#open() was deprecated in Ruby 2.7 and removed in Ruby 3.0,
# while the read() method open-uri adds to URI objects is still available.
html = URI.parse(URL).read.strip
ast  = parse(html)
# Walk down the tree by tag name, picking the wanted element out of each list
# of sibling <div>s (or <td>s) by its class or id attribute.
puts ast["html"]["body"]["div"]
  .find { |div| div[:attributes]["class"] == "container"      }["div"]
  .find { |div| div[:attributes]["id"]    == "content"        }["div"]["div"]
  .find { |div| div[:attributes]["id"]    == "mainbar"        }["div"]
  .find { |div| div[:attributes]["id"]    == "answers"        }["div"]
  .find { |div| div[:attributes]["id"]    == "answer-1732454" }["table"]["tr"]
  .first["td"]
  .find { |div| div[:attributes]["class"] == "answercell"     }["div"]["p"]
  .first[:contents]
  .map(&method(:convert_to_bbcode))  # to reach a wider audience
  .join

Deadly Regular Expressions

25

Regex Code Equivalency

22

A Regex Can't Match Balanced Parentheses

20

Can You snake_case/CamelCase With One Regex?

19

"You can't parse [X]HTML with regex."

Search

Categories

Tags