Deadly Regular Expressions

What can we learn by using regular expressions to do what they cannot do?

20

SEP
2014

Can You snake_case/CamelCase With One Regex?

In Rails, methods like underscore() and camelize() use several regexen to transform the String under the hood. Many people have asked if you can do it with a single regex though. These specs I borrowed from Rails seem to say yes:

#!/usr/bin/env ruby -w

# Case-conversion helpers added to String. Each conversion is performed by a
# single regex substitution.
#
# All three conversions accept an optional acronym registry. The registry must
# respond to +regex+ (matches the canonical spellings), +inverted_regex+
# (matches the lowercase spellings) and +capitalize+ (lowercase spelling ->
# canonical spelling, yielding to a block when the key is unknown). The default
# registry is the per-class one returned by String.acronyms.
class String
  # Converts a CamelCase name into snake_case, turning "::" into "/".
  #
  # The pattern has four alternatives, tried in this order at each position:
  # 1. a registered acronym together with the character (or word boundary) on
  #    either side of it,
  # 2. a run of capitals followed by a capital and a non-capital ("HTMLTi"),
  # 3. a non-capital, non-colon character followed by a capital ("lG"),
  # 4. a "::" namespace separator.
  # For 1-3 the non-empty captured pieces are re-joined with "_"; for 4 the
  # match is replaced by "/". The whole result is then downcased.
  #
  # @param acronyms [#regex] acronym registry
  # @return [String] a new, converted string
  def snake_case(acronyms = self.class.acronyms)
    gsub( %r{
      (?:
        (?<before>  \b | [A-Za-z\d]   )
        (?<acronym> #{acronyms.regex} )
        (?<after>   \b | [^a-z]       )
      )
      |
      (?: (?<before> [A-Z]+ ) (?<after> [A-Z][^A-Z] ) )
      |
      (?: (?<before> [^A-Z:] ) (?<after> [A-Z] ) )
      |
      (?<nesting> :: )
    }x ) {
      match = Regexp.last_match
      if match[:nesting]
        "/"
      else
        # Groups that did not take part in the match are nil, and a group that
        # matched only a word boundary is "", so both are dropped before
        # joining.
        [match[:before], match[:acronym], match[:after]]
          .compact
          .reject(&:empty?)
          .join("_")
      end
    }.downcase
  end

  # Converts a snake_case path into CamelCase, turning "/" into "::".
  #
  # A match is either a registered lowercase acronym (at the start, after "_"
  # or after "/") or the single character that follows the start of the
  # string, a "_" or a "/". Acronyms are replaced by their canonical spelling;
  # any other matched character is upcased.
  #
  # @param acronyms [#inverted_regex, #capitalize] acronym registry
  # @return [String] a new, converted string
  def CamelCase(acronyms = self.class.acronyms)
    gsub( %r{
      (?:
        (?: \A | _ | (?<nesting> / ) )
        (?<acronym> #{acronyms.inverted_regex} )
        (?= \b | [A-Z_] )
      )
      |
      (?: (?: \A | _ ) (?<letter> . ) )
      |
      (?: (?<nesting> / ) (?<letter> . ) )
    }mx ) {
      match       = Regexp.last_match
      nested      = match[:nesting] && "::"
      # When no acronym was captured the key is nil, so the registry falls
      # back to the block and the single captured letter is upcased instead.
      capitalized = acronyms.capitalize(match[:acronym]) { match[:letter].upcase }
      "#{nested}#{capitalized}"
    }
  end

  # Converts a snake_case path into camelCase (CamelCase with a lowercase
  # start).
  #
  # A registered acronym at the very start of the CamelCase result is
  # lowercased as a whole ("APIController" -> "apiController"); otherwise only
  # a leading A-Z character is lowercased.
  #
  # @param acronyms [#regex, #inverted_regex, #capitalize] acronym registry
  # @return [String] a new, converted string
  def camelCase(acronyms = self.class.acronyms)
    leading = /\A(?:#{acronyms.regex}(?=\b|[A-Z_])|[A-Z])/
    self.CamelCase(acronyms).sub(leading) { |head| head.downcase }
  end

  # Returns the acronym registry for this class, creating an empty one on
  # first use.
  def self.acronyms
    @acronyms ||= AcronymManager.new
  end
end

# Registry of acronyms. It stores each acronym's canonical spelling together
# with its lowercase form and builds regexes that match either set.
class AcronymManager
  # Returned while nothing is registered. The pattern demands an "A" after the
  # end of the string, so it can never match.
  NEVER_MATCHES = /\zA/

  def initialize
    @acronyms = {}
    @inverted = {}
  end

  attr_reader :acronyms, :inverted
  private     :acronyms, :inverted

  # Registers +acronym+ under its canonical spelling and rebuilds the
  # lowercase => canonical lookup, which is also the return value.
  def add(acronym)
    acronyms.store(acronym, acronym.downcase)
    @inverted = acronyms.invert
  end

  # Regexp matching any registered acronym in its canonical spelling.
  def regex
    alternation_of(acronyms.keys)
  end

  # Regexp matching any registered acronym in its lowercase spelling.
  def inverted_regex
    alternation_of(inverted.keys)
  end

  # Looks up the canonical spelling for a lowercase +acronym+. The optional
  # block is handed to Hash#fetch as the fallback for unknown keys.
  def capitalize(acronym, &default)
    inverted.fetch(acronym, &default)
  end

  private

  # Builds a non-capturing alternation of the regex-escaped +words+, or
  # NEVER_MATCHES when there are none.
  def alternation_of(words)
    return NEVER_MATCHES if words.empty?

    escaped = words.map { |word| Regexp.escape(word) }
    /(?:#{escaped.join('|')})/
  end
end

# Self-checking specs, run only when this file is executed directly. Most of
# the example tables are taken from the Rails inflector tests linked above
# each table.
#
# Every value under test is wrapped in `_()` before an expectation is called
# on it: calling `must_equal` directly on arbitrary objects is deprecated in
# Minitest 5 and is announced to fail in Minitest 6.
if $PROGRAM_NAME == __FILE__
  require "minitest/autorun"

  describe "Case changing" do
    # Pairs that convert in both directions.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L118-L123
    let(:examples) {
      {
        "Product"               => "product",
        "SpecialGuest"          => "special_guest",
        "ApplicationController" => "application_controller",
        "Area51Controller"      => "area51_controller",
      }
    }
    # Pairs checked only in the CamelCase -> snake_case direction.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L139-L145
    let(:one_way_snake_examples) {
      {
        "HTMLTidy"              => "html_tidy",
        "HTMLTidyGenerator"     => "html_tidy_generator",
        "FreeBSD"               => "free_bsd",
        "HTML"                  => "html",
        "ForceXMLController"    => "force_xml_controller"
      }
    }
    # Pairs checked only in the snake_case -> CamelCase direction.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L98
    let(:one_way_camel_examples) {
      {
        "CamelCase"             => "Camel_Case"
      }
    }
    # added by James
    let(:path_examples) {
      {
        "SomeLib::WithClass"    => "some_lib/with_class"
      }
    }
    # Pairs checked in both directions with the acronyms registered in the
    # "is aware of acronyms" spec below.
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L101-L145
    let(:acronym_examples) {
      {
        "API"                   => "api",
        "APIController"         => "api_controller",
        "Nokogiri::HTML"        => "nokogiri/html",
        "HTTPAPI"               => "http_api",
        "HTTP::Get"             => "http/get",
        "SSLError"              => "ssl_error",
        "RESTful"               => "restful",
        "RESTfulController"     => "restful_controller",
        "Nested::RESTful"       => "nested/restful",
        "IHeartW3C"             => "i_heart_w3c",
        "PhDRequired"           => "phd_required",
        "IRoRU"                 => "i_ror_u",
        "RESTfulHTTPAPI"        => "restful_http_api",

        # misdirection
        "Capistrano"            => "capistrano",
        "CapiController"        => "capi_controller",
        "HttpsApis"             => "https_apis",
        "Html5"                 => "html5",
        "Restfully"             => "restfully",
        "RoRails"               => "ro_rails"
      }
    }

    it "can snake_case a String" do
      examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
      end
    end

    it "can handle some tricky one-way cases for snake_case" do
      one_way_snake_examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
      end
    end

    it "can CamelCase a String" do
      examples.each do |camel, snake|
        _(snake.CamelCase).must_equal(camel)
      end
    end

    it "can handle some tricky one-way cases for CamelCase" do
      one_way_camel_examples.each do |camel, snakey|
        _(snakey.CamelCase).must_equal(camel)
      end
    end

    it "can camelCase a String" do
      _("camel_case".camelCase).must_equal("camelCase")
    end

    it "can convert nesting to paths and back" do
      path_examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
        _(snake.CamelCase).must_equal(camel)
      end
    end

    it "is aware of acronyms" do
      # A dedicated registry is used here instead of the String.acronyms
      # default, so the acronyms are passed explicitly to each conversion.
      acronyms = AcronymManager.new
      acronyms.add("API")
      acronyms.add("HTML")
      acronyms.add("HTTP")
      acronyms.add("RESTful")
      acronyms.add("W3C")
      acronyms.add("PhD")
      acronyms.add("RoR")
      acronyms.add("SSL")

      acronym_examples.each do |camel, snake|
        _(camel.snake_case(acronyms)).must_equal(snake)
        _(snake.CamelCase(acronyms)).must_equal(camel)
      end
    end
  end
end

Rails handles a few edge cases, like acronyms and converting paths to class hierarchies. If you don't need all of that fanciness, the expressions used can get a lot simpler.

Comments (0)
Leave a Comment (using GitHub Flavored Markdown)

Comments on this blog are moderated. Spam is removed, formatting is fixed, and there's a zero tolerance policy on intolerance.

Ajax loader