Regular Expression

Posts tagged with "Regular Expression."
  • 22

    SEP
    2014

    A Regex Can't Match Balanced Parentheses

    Can we do math with regular expressions?

    #!/usr/bin/env ruby -w
    
    # Builds the regex used to find one binary operation at a time so that the
    # caller can wrap it in explicit parentheses.
    #
    # number_regex - pattern interpolated as the definition of a number.
    # ops          - collection of operator strings; each one is escaped with
    #                Regexp.escape and they are joined into one character class.
    #
    # The groups followed by {0} only define named subpatterns (they repeat zero
    # times) which are then invoked with \g<name>.  The final line is the real
    # match:  a "term operator term" sequence followed either by optional
    # whitespace and the end of the string, or by any character other than a
    # closing parenthesis.
    #
    # Returns the extended-mode Regexp.
    def build_preparation_regex(number_regex, ops)
      %r{
        (?<number>             #{number_regex}                                   ){0}
        (?<operator>           [#{ops.map(&Regexp.method(:escape)).join}]        ){0}
        (?<term_operator_term> \g<term> \s* \g<operator> \s* \g<term>            ){0}
        (?<term>               \g<number> | \( \s* \g<term_operator_term> \s* \) ){0}
    
        \g<term_operator_term>(?=\s*\z|[^)])
      }x
    end
    
    # A signed integer or decimal literal.
    NUMBER_REGEX               = %r{
      -?            # an optional minus
      \d+           # an integer
      (?: \. \d+)?  # an optional fractional bit
    }x
    # Preparation regex whose operator class contains only * and /.
    PREPARE_MULT_AND_DIV_REGEX = build_preparation_regex(NUMBER_REGEX, %w[* /])
    # Preparation regex whose operator class contains all four operators.
    PREPARE_ADD_AND_SUB_REGEX  = build_preparation_regex(NUMBER_REGEX, %w[* / + -])
    # Matches only when the whole string is a single term:  either a number or
    # a parenthesized "term operator term", defined recursively via \g<term>.
    CHECK_REGEX                = %r{
      \A                   # the start of the expression
      (?<term>             # a term, which is:
        #{NUMBER_REGEX}    # a number
        |                  # or
        \( \s*             # a parenthesized group of
          \g<term>         # a term
          \s* [*/+\-] \s*  # an operator
          \g<term>         # and another term
        \s* \)             # the end of the parenthesized group
      )
      \z                   # the end of the expression
    }x
    # Matches one parenthesized "number operator number" with no nested groups,
    # capturing the pieces as left, operator, and right.
    MATH_REGEX                 = %r{
      \( \s*
      (?<left>     #{NUMBER_REGEX} )
      \s*
      (?<operator> [*/+\-]         )
      \s*
      (?<right>    #{NUMBER_REGEX} )
      \s* \)
    }x
    
    # Command-line driver:  takes one math expression from ARGV (plus an
    # optional -v switch), parenthesizes it, validates it, and then reduces it
    # one parenthesized operation at a time.
    verbose = ARGV.delete("-v")
    # Check for a missing argument *before* calling strip().  Stripping first
    # would raise NoMethodError on nil and the usage message would never show.
    problem = ARGV.first or abort "USAGE:  #{$PROGRAM_NAME} MATH_EXPRESSION"
    problem = problem.strip
    steps   = [ ]
    
    # Wrap operations in parentheses, one substitution per pass, first with the
    # multiplication/division regex and then with the one for all operators.
    # sub!() returns nil once nothing is left to wrap, which ends the loop.
    [PREPARE_MULT_AND_DIV_REGEX, PREPARE_ADD_AND_SUB_REGEX].each do |preparation|
      loop do
        steps << problem.dup if verbose
        problem.sub!(preparation) { |term| "(#{term})" } or break
      end
    end
    
    # The fully parenthesized expression must be a single valid term.
    problem =~ CHECK_REGEX or abort "Error:  Invalid expression"
    
    # Replace one parenthesized "number operator number" with its Float result
    # per pass until no such group remains.
    solution = problem.dup
    loop do
      steps << solution.dup if verbose
      solution.sub!(MATH_REGEX) {
        $~[:left].to_f.public_send($~[:operator], $~[:right].to_f)
      } or break
    end
    
    # The last recorded step is dropped because the answer is printed below.
    puts steps.uniq[0..-2] if verbose
    # Drop a trailing ".0" so whole-number answers print without a fraction.
    puts solution.sub(/\.0+\z/, "")
    

    Read more…

  • 20

    SEP
    2014

    Can You snake_case/CamelCase With One Regex?

    In Rails, methods like underscore() and camelize() use several regexen to transform the String under the hood. Many people have asked if you can do it with a single regex though. These specs I borrowed from Rails seem to say yes:

    #!/usr/bin/env ruby -w
    
    # Case conversion helpers added to String, each driven by a single regex.
    class String
      # Converts a CamelCase name to snake_case, turning "::" into "/".
      #
      # acronyms - object whose regex() matches the registered acronyms.
      def snake_case(acronyms = self.class.acronyms)
        boundaries = %r{
          (?:
            (?<before>  \b | [A-Za-z\d]   )
            (?<acronym> #{acronyms.regex} )
            (?<after>   \b | [^a-z]       )
          )
          |
          (?: (?<before> [A-Z]+ ) (?<after> [A-Z][^A-Z] ) )
          |
          (?: (?<before> [^A-Z:] ) (?<after> [A-Z] ) )
          |
          (?<nesting> :: )
        }x
    
        underscored = gsub(boundaries) do
          found = Regexp.last_match
          next "/" if found[:nesting]
    
          # Glue together whichever pieces actually matched something.
          [:before, :acronym, :after]
            .map    { |group| found[group] }
            .select { |piece| piece && !piece.empty? }
            .join("_")
        end
        underscored.downcase
      end
    
      # Converts a snake_case path to CamelCase, turning "/" into "::".
      #
      # acronyms - object providing inverted_regex() and capitalize().
      def CamelCase(acronyms = self.class.acronyms)
        word_starts = %r{
          (?:
            (?: \A | _ | (?<nesting> / ) )
            (?<acronym> #{acronyms.inverted_regex} )
            (?= \b | [A-Z_] )
          )
          |
          (?: (?: \A | _ ) (?<letter> . ) )
          |
          (?: (?<nesting> / ) (?<letter> . ) )
        }mx
    
        gsub(word_starts) do
          found     = Regexp.last_match
          separator = "::" if found[:nesting]
          # The block is the fallback used when no acronym was matched.
          word      = acronyms.capitalize(found[:acronym]) { found[:letter].upcase }
          "#{separator}#{word}"
        end
      end
    
      # Same as CamelCase(), but with a lowercase first character.
      def camelCase
        self.CamelCase.sub(/\A[A-Z]/, &:downcase)
      end
    
      # The lazily created, class-wide default acronym registry.
      def self.acronyms
        @acronyms ||= AcronymManager.new
      end
    end
    
    # Registry of acronyms with lookups in both directions:  the original
    # spelling to its downcased form and back again.
    class AcronymManager
      NEVER_MATCHES = /\zA/
    
      def initialize
        @acronyms = { }
        @inverted = { }
      end
    
      attr_reader :acronyms, :inverted
      private     :acronyms, :inverted
    
      # Registers +acronym+ and rebuilds the downcased-to-original lookup,
      # which is also the return value.
      def add(acronym)
        acronyms.store(acronym, acronym.downcase)
        @inverted = acronyms.invert
      end
    
      # A Regexp matching any registered acronym in its original spelling, or
      # NEVER_MATCHES when nothing has been registered.
      def regex
        alternation_of(acronyms.keys)
      end
    
      # A Regexp matching any registered acronym in its downcased spelling, or
      # NEVER_MATCHES when nothing has been registered.
      def inverted_regex
        alternation_of(inverted.keys)
      end
    
      # Looks up the original spelling for a downcased +acronym+, falling back
      # to the +default+ block when it is not registered.
      def capitalize(acronym, &default)
        inverted.fetch(acronym, &default)
      end
    
      private
    
      # Joins the escaped +words+ into one non-capturing alternation.
      def alternation_of(words)
        return NEVER_MATCHES if words.empty?
    
        escaped = words.map { |word| Regexp.escape(word) }
        /(?:#{escaped.join('|')})/
      end
    end
    
    # The specs below only run when this file is executed directly.
    if $PROGRAM_NAME == __FILE__
      require "minitest/autorun"
    
      describe "Case changing" do
        # Each table maps a CamelCase String (key) to its snake_case form (value).
        #
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test_cases.rb#L118-L123
        let(:examples) {
          {
            "Product"               => "product",
            "SpecialGuest"          => "special_guest",
            "ApplicationController" => "application_controller",
            "Area51Controller"      => "area51_controller",
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test_cases.rb#L139-L145
        let(:one_way_snake_examples) {
          {
            "HTMLTidy"              => "html_tidy",
            "HTMLTidyGenerator"     => "html_tidy_generator",
            "FreeBSD"               => "free_bsd",
            "HTML"                  => "html",
            "ForceXMLController"    => "force_xml_controller"
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test.rb#L98
        let(:one_way_camel_examples) {
          {
            "CamelCase"             => "Camel_Case"
          }
        }
        # added by James
        let(:path_examples) {
          {
            "SomeLib::WithClass"    => "some_lib/with_class"
          }
        }
        # https://github.com/rails/rails/blob/
        # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
        # inflector_test.rb#L101-L145
        let(:acronym_examples) {
          {
            "API"                   => "api",
            "APIController"         => "api_controller",
            "Nokogiri::HTML"        => "nokogiri/html",
            "HTTPAPI"               => "http_api",
            "HTTP::Get"             => "http/get",
            "SSLError"              => "ssl_error",
            "RESTful"               => "restful",
            "RESTfulController"     => "restful_controller",
            "Nested::RESTful"       => "nested/restful",
            "IHeartW3C"             => "i_heart_w3c",
            "PhDRequired"           => "phd_required",
            "IRoRU"                 => "i_ror_u",
            "RESTfulHTTPAPI"        => "restful_http_api",
    
            # misdirection
            "Capistrano"            => "capistrano",
            "CapiController"        => "capi_controller",
            "HttpsApis"             => "https_apis",
            "Html5"                 => "html5",
            "Restfully"             => "restfully",
            "RoRails"               => "ro_rails"
          }
        }
    
        it "can snake_case a String" do
          examples.each do |camel, snake|
            camel.snake_case.must_equal(snake)
          end
        end
    
        # These are only checked in the CamelCase-to-snake_case direction.
        it "can handle some tricky one-way cases for snake_case" do
          one_way_snake_examples.each do |camel, snake|
            camel.snake_case.must_equal(snake)
          end
        end
    
        it "can CamelCase a String" do
          examples.each do |camel, snake|
            snake.CamelCase.must_equal(camel)
          end
        end
    
        # These are only checked in the snake_case-to-CamelCase direction.
        it "can handle some tricky one-way cases for CamelCase" do
          one_way_camel_examples.each do |camel, snakey|
            snakey.CamelCase.must_equal(camel)
          end
        end
    
        it "can camelCase a String" do
          "camel_case".camelCase.must_equal("camelCase")
        end
    
        it "can convert nesting to paths and back" do
          path_examples.each do |camel, snake|
            camel.snake_case.must_equal(snake)
            snake.CamelCase.must_equal(camel)
          end
        end
    
        # Uses its own AcronymManager, passed explicitly to both conversions,
        # instead of the default registry.
        it "is aware of acronyms" do
          acronyms = AcronymManager.new
          acronyms.add("API")
          acronyms.add("HTML")
          acronyms.add("HTTP")
          acronyms.add("RESTful")
          acronyms.add("W3C")
          acronyms.add("PhD")
          acronyms.add("RoR")
          acronyms.add("SSL")
    
          acronym_examples.each do |camel, snake|
            camel.snake_case(acronyms).must_equal(snake)
            snake.CamelCase(acronyms).must_equal(camel)
          end
        end
      end
    end
    

    Read more…

  • 19

    SEP
    2014

    "You can't parse [X]HTML with regex."

    The only explanation I'll give for the following code is to provide this link to my favorite Stack Overflow answer.

    #!/usr/bin/env ruby -w
    
    require "open-uri"
    
    # The page that gets downloaded and parsed.
    URL    = "http://stackoverflow.com/questions/1732348/" +
             "regex-match-open-tags-except-xhtml-self-contained-tags"
    # Matches the next piece of markup at the current position (\G), followed by
    # any trailing whitespace.  The groups marked with {0} only define named
    # subpatterns; the alternation at the bottom invokes them with \g<name>, and
    # callers inspect the named captures to see which kind of piece was found.
    #
    # The comment subpattern is lazy (.*?) so that a comment ends at the first
    # "-->".  Because of the /m flag a greedy .* would run on to the LAST "-->"
    # in the input and swallow every tag between two comments.  This agrees with
    # the script subpattern, which is lazy for the same reason.
    PARSER = %r{
      (?<doctype_declaration>
        <!DOCTYPE\b (?<doctype> [^>]* ) >
      ){0}
      (?<comment>
        <!-- .*? -->
      ){0}
    
      (?<script_tag>
        < \s* (?<tag_name> script ) \s* (?<attributes> [^>]* > )
          (?<script> .*? )
        < \s* / \s* script \s* >
      ){0}
      (?<self_closed_tag>
        < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* / \s* > )
      ){0}
      (?<unclosed_tag>
        < \s*
        (?<tag_name> link | meta | br | input | hr | img ) \b
        \s*
        (?<attributes> [^>]* > )
      ){0}
      (?<open_tag>
        < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* > )
      ){0}
      (?<close_tag>
        < \s* / \s* (?<tag_name> \w+ ) \s* >
      ){0}
    
      (?<attribute>
        (?<attribute_name> [-\w]+ )
        (?: \s* = \s* (?<attribute_value> "[^"]*" | '[^']*' | [^>\s]+ ) )? \s*
      ){0}
      (?<attribute_list>
        \g<attribute>
        (?= [^>]* > \z )  # attributes keep a trailing > to disambiguate from text
      ){0}
    
      (?<text>
        (?! [^<]* /?\s*> \z )  # a guard to prevent this from parsing attributes
        [^<]+
      ){0}
    
      \G
      (?:
        \g<doctype_declaration>
        |
        \g<comment>
        |
        \g<script_tag>
        |
        \g<self_closed_tag>
        |
        \g<unclosed_tag>
        |
        \g<open_tag>
        |
        \g<attribute_list>
        |
        \g<close_tag>
        |
        \g<text>
      )
      \s*
    }mix
    
    # Parses +html+ into a tree of nested Hashes by stripping one PARSER match
    # off the front of the markup at a time.
    #
    # html - the markup to parse.  It is left untouched; parsing consumes a
    #        private copy.
    #
    # Returns the root node, a Hash with :attributes, :contents, and :name keys
    # plus one String key per child tag name.
    def parse(html)
      # sub!() eats the input as it goes, so work on a copy instead of emptying
      # (or failing on a frozen) String that belongs to the caller.
      remaining = html.dup
      root      = {attributes: [ ], contents: [ ], name: :root}
      stack     = [root]
      loop do
        remaining.sub!(PARSER, "") or break
        if $~[:doctype_declaration]
          add_to_tree(stack.last, "DOCTYPE", $~[:doctype].strip)
        elsif $~[:script_tag]
          add_to_stack(stack, $~[:tag_name], $~[:attributes], $~[:script])
        elsif $~[:self_closed_tag] || $~[:unclosed_tag] || $~[:open_tag]
          # Only a plain open tag stays on the stack to collect children.
          add_to_stack(stack, $~[:tag_name], $~[:attributes], "", $~[:open_tag])
        elsif $~[:close_tag]
          # A stray closing tag must not pop the root, or the next piece of
          # markup would be added to nil.
          stack.pop if stack.size > 1
        elsif $~[:text]
          stack.last[:contents] << $~[:text]
        end
      end
      # Always hand back the root, even when some tags were never closed.
      root
    end
    
    # Stores +value+ under +name+ in +branch+.  The first value for a name is
    # stored as is; later values for the same name are collected in an Array.
    #
    # Returns +value+ for a new name, otherwise the Array of collected values.
    def add_to_tree(branch, name, value)
      return branch[name] = value unless branch.include?(name)
    
      existing     = branch[name]
      branch[name] = [existing] unless existing.is_a?(Array)
      branch[name] << value
    end
    
    # Builds the node for a tag and attaches it to the node on top of +stack+,
    # both under its tag name and in that node's :contents list.
    #
    # stack           - Array of the currently open nodes, innermost last.
    # tag_name        - name stored in the node and used as the lookup key.
    # attributes_html - raw attribute markup handed to parse_attributes().
    # contents        - initial text content; an empty String is dropped.
    # open            - when truthy the new node is pushed onto +stack+.
    #
    # Returns +stack+ when the node was pushed, otherwise nil.
    def add_to_stack(stack, tag_name, attributes_html, contents, open = false)
      node = {
        attributes: parse_attributes(attributes_html),
        contents:   contents.empty? ? [ ] : [contents],
        name:       tag_name
      }
      parent = stack.last
      add_to_tree(parent, tag_name, node)
      parent[:contents].push(node)
      stack.push(node) if open
    end
    
    # Turns raw attribute markup into a Hash of attribute names to values by
    # stripping one PARSER match at a time off the front of +attributes_html+
    # (which is modified in place).
    #
    # An attribute without a value is given its own name as the value, and one
    # pair of matching surrounding quotes is removed from each value.
    #
    # Returns the Hash of attributes.
    def parse_attributes(attributes_html)
      parsed = { }
      while attributes_html.sub!(PARSER, "")
        found = Regexp.last_match
        name  = found[:attribute_name]
        raw   = found[:attribute_value] || name
        add_to_tree(parsed, name, raw.sub(/\A(["'])(.*)\1\z/, '\2'))
      end
      parsed
    end
    
    # Renders a parsed node as BBCode.  Anything that is not a Hash (such as
    # text content) is returned unchanged; a Hash becomes [name]...[/name]
    # around its recursively converted contents, with "strike" renamed to "s".
    def convert_to_bbcode(node)
      return node unless node.is_a?(Hash)
    
      tag   = node[:name].sub(/\Astrike\z/, "s")
      inner = node[:contents].map { |child| convert_to_bbcode(child) }.join
      "[#{tag}]#{inner}[/#{tag}]"
    end
    
    # Download the page through URI#open, which open-uri adds to HTTP URIs.
    # Kernel#open no longer accepts URLs as of Ruby 3.0 (it would look for a
    # local file with that name), while this form works on old and new Rubies.
    html = URI.parse(URL).open(&:read).strip
    ast  = parse(html)
    # Walk down the tree to the first paragraph of the chosen answer and print
    # its contents as BBCode.
    puts ast["html"]["body"]["div"]
      .find { |div| div[:attributes]["class"] == "container"      }["div"]
      .find { |div| div[:attributes]["id"]    == "content"        }["div"]["div"]
      .find { |div| div[:attributes]["id"]    == "mainbar"        }["div"]
      .find { |div| div[:attributes]["id"]    == "answers"        }["div"]
      .find { |div| div[:attributes]["id"]    == "answer-1732454" }["table"]["tr"]
      .first["td"]
      .find { |div| div[:attributes]["class"] == "answercell"     }["div"]["p"]
      .first[:contents]
      .map(&method(:convert_to_bbcode))  # to reach a wider audience
      .join
    

    Read more…

  • 2

    AUG
    2006

    RegexpChallenge

    Just recently I have been working with two different people to improve their regular expression skills. To help me in this endeavor, I built a trivial little script we have been using in IRb. To get started, you construct a new challenge object and add a couple of challenges:

    >> reg_chal = RegexpChallenge.new
    No challenges.
    => 
    >> reg_chal.challenge("Gray, James", "James", "The names can vary.")
    => nil
    >> reg_chal.challenge("abbbbbbbc bc", 10)
    => nil
    >> reg_chal.challenge( "    \n\t  ", nil,
    ?>                     "We want to test for non-space data." )
    => nil
    >> reg_chal.challenge( "cogs 9, widgets 12, ...", "12",
    ?>                     "The numbers can vary." )
    => nil
    >> reg_chal.challenge( "I'm a simple sentence, with words.",
    ?>                     %w[I'm a simple sentence with words] )
    => nil
    

    You can ask for challenges to see what you would like to solve:

    >> reg_chal.challenges
    Challenge #0:
       Input:  "Gray, James"
      Output:  "James"
        Note:  "The names can vary."
    Challenge #1:
       Input:  "abbbbbbbc bc"
      Output:  10
    Challenge #2:
       Input:  "    \n\t  "
      Output:  nil
        Note:  "We want to test for non-space data."
    Challenge #3:
       Input:  "cogs 9, widgets 12, ..."
      Output:  "12"
        Note:  "The numbers can vary."
    Challenge #4:
       Input:  "I'm a simple sentence, with words."
      Output:  ["I'm", "a", "simple", "sentence", "with", "words"]
    => nil
    

    Read more…