Gray Soft / Tags / Regular Expression

25

SEP
2014

Regex Code Equivalency

#!/usr/bin/env ruby -w

# Sample input shared by every comparison in this section.  Each pair of lines
# shows a regex-based expression followed by a plain method call that gives the
# same result for this input (see the `# =>` annotations).
Name = "Gray, James"

# Boolean tests:  `!!` turns the index-or-nil result of `=~` into true/false.
!!(Name =~ /\AGray/)      # => true
Name.start_with?("Gray")  # => true

!!(Name =~ /James\z/)    # => true
Name.end_with?("James")  # => true

!!(Name =~ /Dana/)     # => false
Name.include?("Dana")  # => false

!!(Name =~ /\A\z/)  # => false
Name.empty?         # => false

!!(Name =~ /\AGray, James\z/)  # => true
Name == "Gray, James"          # => true

!!(Name =~ /\A(?:Gray, James|Gray, Dana)\z/)  # => true
["Gray, James", "Gray, Dana"].include?(Name)  # => true

# Extraction:  String#[] with a Regexp returns the matched text directly, and
# accepts a capture number or name as a second argument.
Name =~ /\A\w+/ && $&  # => "Gray"
Name[/\A\w+/]          # => "Gray"

Name =~ /\A(\w+),\s*(\w+)\z/ && $2  # => "James"
Name[/\A(\w+),\s*(\w+)\z/, 2]       # => "James"

Name =~ /\A(?<last>\w+),\s*(?<first>\w+)\z/ && $~[:first]  # => "James"
Name[/\A(?<last>\w+),\s*(?<first>\w+)\z/, :first]          # => "James"

# Splitting into lines and characters.
Name.scan(/^.*\n?/)  # => ["Gray, James"]
Name.lines           # => ["Gray, James"]

Name.scan(/./m)  # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]
Name.chars       # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]

# Removing or translating characters.
Name.gsub(/[aeiou]/, "")  # => "Gry, Jms"
Name.delete("aeiou")      # => "Gry, Jms"

Name.gsub(/[aeiou]/, "X") # => "GrXy, JXmXs"
Name.tr("aeiou", "X")     # => "GrXy, JXmXs"

# For the destructive operations that follow you can drop the `dup()` and
# switch `sub()` to `sub!()`, as long as you don't care about the return value.

# Insert before the comma:  the lookahead matches the empty spot in front of it.
Name.sub(/(?=,)/, " II")                 # => "Gray II, James"
Name.dup.insert(Name.index(","), " II")  # => "Gray II, James"

Name.sub(/\A/, "Name:  ")    # => "Name:  Gray, James"
Name.dup.prepend("Name:  ")  # => "Name:  Gray, James"

# Whole-string replacement:  /m lets `.` cross newlines so \A.*\z spans it all.
Name.sub(/\A.*\z/m, "Gray, Dana")  # => "Gray, Dana"
Name.dup.replace("Gray, Dana")     # => "Gray, Dana"

Name.sub(/\A.*\z/m, "")  # => ""
Name.dup.clear           # => ""



# Sample input with leading, interior, and trailing whitespace.  As above, each
# pair shows a regex-based expression and a String method with the same result
# for this input.
Spacey = "\tsome    space\r\n"

# Trimming whitespace from the left, the right, and both ends.
Spacey.sub(/\A\s+/, "")  # => "some    space\r\n"
Spacey.lstrip            # => "some    space\r\n"

Spacey.sub(/\s+\z/, "")  # => "\tsome    space"
Spacey.rstrip            # => "\tsome    space"

Spacey.sub(/\A\s*(.+?)\s*\z/m, '\1')  # => "some    space"
Spacey.strip                          # => "some    space"

# Removing a trailing line ending (chomp) or the trailing character, with
# "\r\n" treated as a single unit (chop).
Spacey.sub(/(?:\r?\n|\r)\z/m, "")  # => "\tsome    space"
Spacey.chomp                       # => "\tsome    space"

Spacey.sub(/(?:\r\n|.)\z/m, "")  # => "\tsome    space"
Spacey.chop                      # => "\tsome    space"

# Collapsing runs of spaces into a single space.
Spacey.gsub(/ +/, " ")  # => "\tsome space\r\n"
Spacey.squeeze(" ")     # => "\tsome space\r\n"

In: Deadly Regular Expressions | Tags: Regular Expression, Style & Syntax | 2 Comments

22

SEP
2014

A Regex Can't Match Balanced Parentheses

Can we do math with regular expressions?

#!/usr/bin/env ruby -w

# Builds the regex used to find one "term operator term" expression so that it
# can be wrapped in parentheses.
#
# number_regex - a Regexp matching a single number.
# ops          - an Array of operator Strings; each one is escaped and the
#                results are placed in a single character class.
#
# Returns an extended-mode Regexp.  A term is either a number or a
# parenthesized "term operator term".  The match must be followed by optional
# whitespace and the end of the String, or by any character other than ")".
def build_preparation_regex(number_regex, ops)
  operator_class = ops.map { |op| Regexp.escape(op) }.join

  %r{
    (?<number>             #{number_regex}                                   ){0}
    (?<operator>           [#{operator_class}]                               ){0}
    (?<term_operator_term> \g<term> \s* \g<operator> \s* \g<term>            ){0}
    (?<term>               \g<number> | \( \s* \g<term_operator_term> \s* \) ){0}

    \g<term_operator_term>(?=\s*\z|[^)])
  }x
end

# Matches one number:  an optional minus sign, an integer part, and an
# optional fractional part.
NUMBER_REGEX               = %r{
  -?            # an optional minus
  \d+           # an integer
  (?: \. \d+)?  # an optional fractional bit
}x
# Preparation regex whose operator class holds only * and /.
PREPARE_MULT_AND_DIV_REGEX = build_preparation_regex(NUMBER_REGEX, %w[* /])
# Preparation regex whose operator class holds all four operators (* and / as
# well as + and -).
PREPARE_ADD_AND_SUB_REGEX  = build_preparation_regex(NUMBER_REGEX, %w[* / + -])
# Matches an entire String that is a single term:  either a bare number or a
# parenthesized "term operator term", with terms nesting recursively.
CHECK_REGEX                = %r{
  \A                   # the start of the expression
  (?<term>             # a term, which is:
    #{NUMBER_REGEX}    # a number
    |                  # or
    \( \s*             # a parenthesized group of
      \g<term>         # a term
      \s* [*/+\-] \s*  # an operator
      \g<term>         # and another term
    \s* \)             # the end of the parenthesized group
  )
  \z                   # the end of the expression
}x
# Matches one parenthesized "number operator number" anywhere in a String,
# capturing the pieces as `left`, `operator`, and `right`.
MATH_REGEX                 = %r{
  \( \s*
  (?<left>     #{NUMBER_REGEX} )
  \s*
  (?<operator> [*/+\-]         )
  \s*
  (?<right>    #{NUMBER_REGEX} )
  \s* \)
}x

# Command-line driver:  reads a math expression from the first argument, adds
# parentheses with the preparation regexes, validates the result, and then
# reduces it one parenthesized pair at a time.  Pass -v to print each step.
verbose = ARGV.delete("-v")
# Look for the argument before calling strip():  with no argument ARGV.first
# is nil, and nil.strip would raise NoMethodError instead of showing the usage.
problem = ARGV.first or abort "USAGE:  #{$PROGRAM_NAME} MATH_EXPRESSION"
problem = problem.strip
steps   = [ ]

# Wrap expressions in parentheses until nothing more matches, first with the
# multiplication/division regex and then with the one that also allows + and -.
[PREPARE_MULT_AND_DIV_REGEX, PREPARE_ADD_AND_SUB_REGEX].each do |preparation|
  loop do
    steps << problem.dup if verbose
    problem.sub!(preparation) { |term| "(#{term})" } or break
  end
end

problem =~ CHECK_REGEX or abort "Error:  Invalid expression"

# Replace one "(left operator right)" group at a time with its Float result
# until no such group remains.
solution = problem.dup
loop do
  steps << solution.dup if verbose
  solution.sub!(MATH_REGEX) {
    $~[:left].to_f.public_send($~[:operator], $~[:right].to_f)
  } or break
end

# Omit the last recorded step; the solution itself is printed below.
puts steps.uniq[0..-2] if verbose
# Drop a trailing ".0" (or ".00", ...) from the printed result.
puts solution.sub(/\.0+\z/, "")

In: Deadly Regular Expressions | Tags: For Fun & Regular Expression | 3 Comments

20

SEP
2014

Can You snake_case/CamelCase With One Regex?

In Rails, methods like underscore() and camelize() use several regexen to transform the String under the hood. Many people have asked if you can do it with a single regex though. These specs I borrowed from Rails seem to say yes:

#!/usr/bin/env ruby -w

# Case-conversion helpers added directly to String:  snake_case(), CamelCase(),
# and camelCase().  Acronym handling is delegated to an object that responds
# to regex(), inverted_regex(), and capitalize(); by default that is the
# AcronymManager returned by String.acronyms().
class String
  # Returns the default acronym manager, creating it on first use.
  def self.acronyms
    @acronyms ||= AcronymManager.new
  end

  # Converts CamelCase text to snake_case with a single gsub().  Word breaks
  # get an underscore, "::" becomes "/", and the result is downcased.
  def snake_case(acronyms = self.class.acronyms)
    word_breaks = %r{
      (?:
        (?<before>  \b | [A-Za-z\d]   )
        (?<acronym> #{acronyms.regex} )
        (?<after>   \b | [^a-z]       )
      )
      |
      (?: (?<before> [A-Z]+ ) (?<after> [A-Z][^A-Z] ) )
      |
      (?: (?<before> [^A-Z:] ) (?<after> [A-Z] ) )
      |
      (?<nesting> :: )
    }x

    underscored = gsub(word_breaks) do
      found = $~
      next "/" if found[:nesting]

      # Join whichever pieces actually matched something.
      [found[:before], found[:acronym], found[:after]]
        .reject { |piece| piece.nil? || piece.empty? }
        .join("_")
    end
    underscored.downcase
  end

  # Converts snake_case text to CamelCase with a single gsub().  The start of
  # each word is capitalized (or swapped for a known acronym) and "/" becomes
  # "::".
  def CamelCase(acronyms = self.class.acronyms)
    word_starts = %r{
      (?:
        (?: \A | _ | (?<nesting> / ) )
        (?<acronym> #{acronyms.inverted_regex} )
        (?= \b | [A-Z_] )
      )
      |
      (?: (?: \A | _ ) (?<letter> . ) )
      |
      (?: (?<nesting> / ) (?<letter> . ) )
    }mx

    gsub(word_starts) do
      found     = $~
      separator = "::" if found[:nesting]
      # The block supplies the fallback when no acronym applies.
      word      = acronyms.capitalize(found[:acronym]) { found[:letter].upcase }
      "#{separator}#{word}"
    end
  end

  # Same as CamelCase() with the default acronyms, but with a leading capital
  # letter downcased.
  def camelCase
    self.CamelCase.sub(/\A[A-Z]/, &:downcase)
  end
end

# Keeps a list of acronyms and builds the regexes the String case conversions
# interpolate into their patterns.
class AcronymManager
  # End of String followed by a literal "A":  a pattern that cannot match.
  NEVER_MATCHES = /\zA/

  def initialize
    @acronyms = { }
    @inverted = { }
  end

  attr_reader :acronyms, :inverted
  private     :acronyms, :inverted

  # Registers an acronym under its downcased form and rebuilds the reverse
  # (downcased => original) lookup.
  def add(acronym)
    acronyms.store(acronym, acronym.downcase)
    @inverted = acronyms.invert
  end

  # A Regexp matching any registered acronym as originally written, or
  # NEVER_MATCHES when none are registered.
  def regex
    alternation_of(acronyms.keys)
  end

  # A Regexp matching the downcased form of any registered acronym, or
  # NEVER_MATCHES when none are registered.
  def inverted_regex
    alternation_of(inverted.keys)
  end

  # Looks up the original spelling for a downcased acronym.  When it is not
  # registered the block (if given) supplies the result, as with Hash#fetch.
  def capitalize(acronym, &default)
    inverted.fetch(acronym, &default)
  end

  private

  # Builds a non-capturing alternation of the escaped words.
  def alternation_of(words)
    return NEVER_MATCHES if words.empty?

    alternatives = words.map { |word| Regexp.escape(word) }.join('|')
    /(?:#{alternatives})/
  end
end

# Run the specs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  require "minitest/autorun"

  describe "Case changing" do
    # CamelCase => snake_case pairs that are checked in both directions.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L118-L123
    let(:examples) {
      {
        "Product"               => "product",
        "SpecialGuest"          => "special_guest",
        "ApplicationController" => "application_controller",
        "Area51Controller"      => "area51_controller",
      }
    }
    # Pairs that are only checked in the snake_case() direction.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L139-L145
    let(:one_way_snake_examples) {
      {
        "HTMLTidy"              => "html_tidy",
        "HTMLTidyGenerator"     => "html_tidy_generator",
        "FreeBSD"               => "free_bsd",
        "HTML"                  => "html",
        "ForceXMLController"    => "force_xml_controller"
      }
    }
    # Pairs that are only checked in the CamelCase() direction.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L98
    let(:one_way_camel_examples) {
      {
        "CamelCase"             => "Camel_Case"
      }
    }
    # added by James
    let(:path_examples) {
      {
        "SomeLib::WithClass"    => "some_lib/with_class"
      }
    }
    # Pairs checked in both directions with an explicit AcronymManager.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L101-L145
    let(:acronym_examples) {
      {
        "API"                   => "api",
        "APIController"         => "api_controller",
        "Nokogiri::HTML"        => "nokogiri/html",
        "HTTPAPI"               => "http_api",
        "HTTP::Get"             => "http/get",
        "SSLError"              => "ssl_error",
        "RESTful"               => "restful",
        "RESTfulController"     => "restful_controller",
        "Nested::RESTful"       => "nested/restful",
        "IHeartW3C"             => "i_heart_w3c",
        "PhDRequired"           => "phd_required",
        "IRoRU"                 => "i_ror_u",
        "RESTfulHTTPAPI"        => "restful_http_api",

        # misdirection
        "Capistrano"            => "capistrano",
        "CapiController"        => "capi_controller",
        "HttpsApis"             => "https_apis",
        "Html5"                 => "html5",
        "Restfully"             => "restfully",
        "RoRails"               => "ro_rails"
      }
    }

    it "can snake_case a String" do
      examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
      end
    end

    it "can handle some tricky one-way cases for snake_case" do
      one_way_snake_examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
      end
    end

    it "can CamelCase a String" do
      examples.each do |camel, snake|
        snake.CamelCase.must_equal(camel)
      end
    end

    it "can handle some tricky one-way cases for CamelCase" do
      one_way_camel_examples.each do |camel, snakey|
        snakey.CamelCase.must_equal(camel)
      end
    end

    it "can camelCase a String" do
      "camel_case".camelCase.must_equal("camelCase")
    end

    it "can convert nesting to paths and back" do
      path_examples.each do |camel, snake|
        camel.snake_case.must_equal(snake)
        snake.CamelCase.must_equal(camel)
      end
    end

    it "is aware of acronyms" do
      # Use a dedicated manager rather than the String-wide default.
      acronyms = AcronymManager.new
      acronyms.add("API")
      acronyms.add("HTML")
      acronyms.add("HTTP")
      acronyms.add("RESTful")
      acronyms.add("W3C")
      acronyms.add("PhD")
      acronyms.add("RoR")
      acronyms.add("SSL")

      acronym_examples.each do |camel, snake|
        camel.snake_case(acronyms).must_equal(snake)
        snake.CamelCase(acronyms).must_equal(camel)
      end
    end
  end
end

In: Deadly Regular Expressions | Tags: Regular Expression | 0 Comments

19

SEP
2014

"You can't parse [X]HTML with regex."

The only explanation I'll give for the following code is to provide this link to my favorite Stack Overflow answer.

#!/usr/bin/env ruby -w

require "open-uri"

# The page to download and parse.
URL    = "http://stackoverflow.com/questions/1732348/" +
         "regex-match-open-tags-except-xhtml-self-contained-tags"
# One regex that consumes a single piece of HTML from the current position
# (\G) plus any trailing whitespace.  The named groups defined with {0} are
# subroutines invoked through \g<name>; which of them matched tells the caller
# what kind of piece was consumed.  The flags are multiline (`.` crosses
# newlines), case-insensitive, and extended.
PARSER = %r{
  (?<doctype_declaration>
    <!DOCTYPE\b (?<doctype> [^>]* ) >
  ){0}
  (?<comment>
    <!-- .*? -->  # lazy, so a comment stops at its first terminator
  ){0}

  (?<script_tag>
    < \s* (?<tag_name> script ) \s* (?<attributes> [^>]* > )
      (?<script> .*? )
    < \s* / \s* script \s* >
  ){0}
  (?<self_closed_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* / \s* > )
  ){0}
  (?<unclosed_tag>
    < \s*
    (?<tag_name> link | meta | br | input | hr | img ) \b
    \s*
    (?<attributes> [^>]* > )
  ){0}
  (?<open_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* > )
  ){0}
  (?<close_tag>
    < \s* / \s* (?<tag_name> \w+ ) \s* >
  ){0}

  (?<attribute>
    (?<attribute_name> [-\w]+ )
    (?: \s* = \s* (?<attribute_value> "[^"]*" | '[^']*' | [^>\s]+ ) )? \s*
  ){0}
  (?<attribute_list>
    \g<attribute>
    (?= [^>]* > \z )  # attributes keep a trailing > to disambiguate from text
  ){0}

  (?<text>
    (?! [^<]* /?\s*> \z )  # a guard to prevent this from parsing attributes
    [^<]+
  ){0}

  \G
  (?:
    \g<doctype_declaration>
    |
    \g<comment>
    |
    \g<script_tag>
    |
    \g<self_closed_tag>
    |
    \g<unclosed_tag>
    |
    \g<open_tag>
    |
    \g<attribute_list>
    |
    \g<close_tag>
    |
    \g<text>
  )
  \s*
}mix

# Parses HTML into a tree of nested Hashes by repeatedly removing whatever
# PARSER matches at the front of the text.
#
# html - the HTML String to parse.  It is left untouched:  parsing consumes a
#        private copy, so frozen Strings work and the caller keeps its data.
#
# Each node is a Hash with :attributes, :contents, and :name keys, plus one
# String key per child tag name (see add_to_tree()).  Returns the node on top
# of the stack when nothing more matches, which is the root node as long as
# every opened tag was closed.
def parse(html)
  remaining = html.dup
  stack     = [{attributes: [ ], contents: [ ], name: :root}]
  loop do
    remaining.sub!(PARSER, "") or break
    if $~[:doctype_declaration]
      add_to_tree(stack.last, "DOCTYPE", $~[:doctype].strip)
    elsif $~[:script_tag]
      add_to_stack(stack, $~[:tag_name], $~[:attributes], $~[:script])
    elsif $~[:self_closed_tag] || $~[:unclosed_tag] || $~[:open_tag]
      # Only a real open tag stays on the stack to collect children.
      add_to_stack(stack, $~[:tag_name], $~[:attributes], "", $~[:open_tag])
    elsif $~[:close_tag]
      stack.pop
    elsif $~[:text]
      stack.last[:contents] << $~[:text]
    end
    # Anything else PARSER consumed (such as a comment) is discarded.
  end
  stack.pop
end

# Stores value under name in the branch Hash.  The first value for a name is
# stored as is; once a name repeats, its values are collected in an Array.
def add_to_tree(branch, name, value)
  return branch[name] = value unless branch.include?(name)

  existing     = branch[name]
  branch[name] = [existing] unless existing.is_a?(Array)
  branch[name] << value
end

# Builds a node for a tag and attaches it to the node on top of the stack,
# both under its tag name and in the parent's ordered :contents list.
#
# stack           - the Array of currently open nodes.
# tag_name        - the tag's name.
# attributes_html - the raw attribute text, handed to parse_attributes().
# contents        - initial content; an empty String means no content.
# open            - when truthy, the new node is pushed onto the stack so
#                   that later pieces become its children.
def add_to_stack(stack, tag_name, attributes_html, contents, open = false)
  parent = stack.last
  node   = { attributes: parse_attributes(attributes_html),
             contents:   contents.empty? ? [ ] : [contents],
             name:       tag_name }

  add_to_tree(parent, tag_name, node)
  parent[:contents] << node
  stack << node if open
end

# Turns raw attribute text into a Hash of attribute names and values by
# removing one PARSER match at a time from the front of attributes_html (which
# is modified in place).  An attribute without a value uses its own name as
# the value, and one pair of surrounding quotes is stripped from values.
def parse_attributes(attributes_html)
  attributes = { }
  while attributes_html.sub!(PARSER, "")
    found = $~
    raw   = found[:attribute_value] || found[:attribute_name]
    add_to_tree(
      attributes,
      found[:attribute_name],
      raw.sub(/\A(["'])(.*)\1\z/, '\2')
    )
  end
  attributes
end

# Renders a parsed node as BBCode.  Hash nodes become "[name]...[/name]" with
# their contents rendered recursively and a "strike" tag renamed to "s";
# anything else (such as text) is returned unchanged.
def convert_to_bbcode(node)
  return node unless node.is_a?(Hash)

  tag  = node[:name].sub(/\Astrike\z/, "s")
  body = node[:contents].map { |child| convert_to_bbcode(child) }.join
  "[#{tag}]#{body}[/#{tag}]"
end

# Download the page, parse it, and print the first paragraph of one answer as
# BBCode.  The URL is opened through its URI object:  open-uri no longer
# handles URLs passed to Kernel#open as of Ruby 3.0, while URI#open works on
# both older and newer versions.
html = URI.parse(URL).open(&:read).strip
ast  = parse(html)
# Walk down the tree to the answer's first <p>, selecting nodes by attribute
# where a tag name repeats.
puts ast["html"]["body"]["div"]
  .find { |div| div[:attributes]["class"] == "container"      }["div"]
  .find { |div| div[:attributes]["id"]    == "content"        }["div"]["div"]
  .find { |div| div[:attributes]["id"]    == "mainbar"        }["div"]
  .find { |div| div[:attributes]["id"]    == "answers"        }["div"]
  .find { |div| div[:attributes]["id"]    == "answer-1732454" }["table"]["tr"]
  .first["td"]
  .find { |div| div[:attributes]["class"] == "answercell"     }["div"]["p"]
  .first[:contents]
  .map(&method(:convert_to_bbcode))  # to reach a wider audience
  .join

In: Deadly Regular Expressions | Tags: For Fun, Parsing & Regular Expression | 1 Comment

2

AUG
2006

RegexpChallenge

Just recently I have been working with two different people to improve their regular expression skills. To help me in this endeavor, I built a trivial little script we have been using in IRb. To get started, you construct a new challenge object and add a couple of challenges:

>> reg_chal = RegexpChallenge.new
No challenges.
=> 
>> reg_chal.challenge("Gray, James", "James", "The names can vary.")
=> nil
>> reg_chal.challenge("abbbbbbbc bc", 10)
=> nil
>> reg_chal.challenge( "    \n\t  ", nil,
?>                     "We want to test for non-space data." )
=> nil
>> reg_chal.challenge( "cogs 9, widgets 12, ...", "12",
?>                     "The numbers can vary." )
=> nil
>> reg_chal.challenge( "I'm a simple sentence, with words.",
?>                     %w[I'm a simple sentence with words] )
=> nil

You can ask for challenges to see what you would like to solve:

>> reg_chal.challenges
Challenge #0:
   Input:  "Gray, James"
  Output:  "James"
    Note:  "The names can vary."
Challenge #1:
   Input:  "abbbbbbbc bc"
  Output:  10
Challenge #2:
   Input:  "    \n\t  "
  Output:  nil
    Note:  "We want to test for non-space data."
Challenge #3:
   Input:  "cogs 9, widgets 12, ..."
  Output:  "12"
    Note:  "The numbers can vary."
Challenge #4:
   Input:  "I'm a simple sentence, with words."
  Output:  ["I'm", "a", "simple", "sentence", "with", "words"]
=> nil