-
25
SEP
2014
Regex Code Equivalency
#!/usr/bin/env ruby -w

# Pairs of equivalent expressions: each regex-based expression is followed by
# the plain String (or Array) method call that produces the same value.  The
# `# =>` annotation after each expression shows that value.

Name = "Gray, James"

!!(Name =~ /\AGray/)      # => true
Name.start_with?("Gray")  # => true

!!(Name =~ /James\z/)    # => true
Name.end_with?("James")  # => true

!!(Name =~ /Dana/)     # => false
Name.include?("Dana")  # => false

!!(Name =~ /\A\z/)  # => false
Name.empty?         # => false

!!(Name =~ /\AGray, James\z/)  # => true
Name == "Gray, James"          # => true

!!(Name =~ /\A(?:Gray, James|Gray, Dana)\z/)  # => true
["Gray, James", "Gray, Dana"].include?(Name)  # => true

Name =~ /\A\w+/ && $&  # => "Gray"
Name[/\A\w+/]          # => "Gray"

Name =~ /\A(\w+),\s*(\w+)\z/ && $2  # => "James"
Name[/\A(\w+),\s*(\w+)\z/, 2]       # => "James"

Name =~ /\A(?<last>\w+),\s*(?<first>\w+)\z/ && $~[:first]  # => "James"
Name[/\A(?<last>\w+),\s*(?<first>\w+)\z/, :first]          # => "James"

Name.scan(/^.*\n?/)  # => ["Gray, James"]
Name.lines           # => ["Gray, James"]

Name.scan(/./m)  # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]
Name.chars       # => ["G", "r", "a", "y", ",", " ", "J", "a", "m", "e", "s"]

Name.gsub(/[aeiou]/, "")  # => "Gry, Jms"
Name.delete("aeiou")      # => "Gry, Jms"

Name.gsub(/[aeiou]/, "X")  # => "GrXy, JXmXs"
Name.tr("aeiou", "X")      # => "GrXy, JXmXs"

# For the destructive operations that follow you can drop the `dup()` and
# switch `sub()` to `sub!()`, as long as you don't care about the return value.

Name.sub(/(?=,)/, " II")                  # => "Gray II, James"
Name.dup.insert(Name.index(","), " II")   # => "Gray II, James"

Name.sub(/\A/, "Name: ")    # => "Name: Gray, James"
Name.dup.prepend("Name: ")  # => "Name: Gray, James"

Name.sub(/\A.*\z/m, "Gray, Dana")  # => "Gray, Dana"
Name.dup.replace("Gray, Dana")     # => "Gray, Dana"

Name.sub(/\A.*\z/m, "")  # => ""
Name.dup.clear           # => ""

Spacey = "\tsome space\r\n"

Spacey.sub(/\A\s+/, "")  # => "some space\r\n"
Spacey.lstrip            # => "some space\r\n"

Spacey.sub(/\s+\z/, "")  # => "\tsome space"
Spacey.rstrip            # => "\tsome space"

Spacey.sub(/\A\s*(.+?)\s*\z/m, '\1')  # => "some space"
Spacey.strip                          # => "some space"

Spacey.sub(/(?:\r?\n|\r)\z/m, "")  # => "\tsome space"
Spacey.chomp                       # => "\tsome space"

Spacey.sub(/(?:\r\n|.)\z/m, "")  # => "\tsome space"
Spacey.chop                      # => "\tsome space"

Spacey.gsub(/ +/, " ")  # => "\tsome space\r\n"
Spacey.squeeze(" ")     # => "\tsome space\r\n"
-
22
SEP
2014
A Regex Can't Match Balanced Parentheses
Can we do math with regular expressions?
#!/usr/bin/env ruby -w

# Evaluates an arithmetic expression given as the first command-line argument
# using regular-expression substitutions only:
#
#   1. "Preparation" passes wrap `term operator term` sequences in parentheses,
#      first for `*` and `/`, then for all four operators.
#   2. CHECK_REGEX verifies that the result is a single, fully parenthesized
#      term.
#   3. MATH_REGEX repeatedly replaces a parenthesized `number operator number`
#      with its computed value until nothing is left to replace.
#
# Pass `-v` to also print the intermediate steps.

# Builds a regex matching `term operator term`, where a term is a number or a
# parenthesized `term operator term`.  The match must be followed either by
# optional whitespace and the end of the string, or by a character that is not
# a closing parenthesis.
#
# number_regex - Regexp matching a single number
# ops          - Array of operator Strings allowed between the two terms
#
# Returns a Regexp.
def build_preparation_regex(number_regex, ops)
  %r{
    (?<number>             #{number_regex}                                   ){0}
    (?<operator>           [#{ops.map(&Regexp.method(:escape)).join}]        ){0}
    (?<term_operator_term> \g<term> \s* \g<operator> \s* \g<term>            ){0}
    (?<term>               \g<number> | \( \s* \g<term_operator_term> \s* \) ){0}

    \g<term_operator_term>(?=\s*\z|[^)])
  }x
end

NUMBER_REGEX = %r{
  -?            # an optional minus
  \d+           # an integer
  (?: \. \d+)?  # an optional fractional bit
}x

PREPARE_MULT_AND_DIV_REGEX = build_preparation_regex(NUMBER_REGEX, %w[* /])
PREPARE_ADD_AND_SUB_REGEX  = build_preparation_regex(NUMBER_REGEX, %w[* / + -])

CHECK_REGEX = %r{
  \A                   # the start of the expression
  (?<term>             # a term, which is:
    #{NUMBER_REGEX}    # a number
    |                  # or
    \( \s*             # a parenthesized group of
      \g<term>         # a term
      \s* [*/+\-] \s*  # an operator
      \g<term>         # and another term
    \s* \)             # the end of the parenthesized group
  )
  \z                   # the end of the expression
}x

MATH_REGEX = %r{
  \( \s*
  (?<left>     #{NUMBER_REGEX} )
  \s*
  (?<operator> [*/+\-]         )
  \s*
  (?<right>    #{NUMBER_REGEX} )
  \s* \)
}x

verbose = ARGV.delete("-v")
# Check for a missing argument before calling strip(): calling strip() on nil
# would raise NoMethodError and the usage message would never be shown.
problem = ARGV.first or abort "USAGE: #{$PROGRAM_NAME} MATH_EXPRESSION"
problem = problem.strip
steps   = [ ]

# Parenthesize multiplication and division first, then everything else.
[PREPARE_MULT_AND_DIV_REGEX, PREPARE_ADD_AND_SUB_REGEX].each do |preparation|
  loop do
    steps << problem.dup if verbose
    problem.sub!(preparation) { |term| "(#{term})" } or break
  end
end

problem =~ CHECK_REGEX or abort "Error: Invalid expression"

solution = problem.dup
loop do
  steps << solution.dup if verbose
  # sub!() returns nil once no parenthesized operation remains.
  solution.sub!(MATH_REGEX) {
    $~[:left].to_f.public_send($~[:operator], $~[:right].to_f)
  } or break
end

# The last recorded step is left out; the final answer is printed below.
puts steps.uniq[0..-2] if verbose
puts solution.sub(/\.0+\z/, "")
-
20
SEP
2014
Can You snake_case/CamelCase With One Regex?
In Rails, methods like
underscore()
and camelize()
use several regexen to transform the String
# ...under the hood. Many people have asked if you can do it with a single
# regex though. These specs I borrowed from Rails seem to say yes:

#!/usr/bin/env ruby -w

class String
  # Converts a CamelCase String (with `::` nesting) to snake_case (with `/`
  # nesting) using a single gsub() pass.
  #
  # acronyms - object responding to regex(); defaults to the shared
  #            String.acronyms manager
  #
  # Returns a new, downcased String.
  def snake_case(acronyms = self.class.acronyms)
    gsub( %r{
      (?:
        (?<before>  \b | [A-Za-z\d]   )
        (?<acronym> #{acronyms.regex} )
        (?<after>   \b | [^a-z]       )
      )
      |
      (?: (?<before> [A-Z]+  ) (?<after> [A-Z][^A-Z] ) )
      |
      (?: (?<before> [^A-Z:] ) (?<after> [A-Z]       ) )
      |
      (?<nesting> :: )
    }x ) { |m|
      if $~[:nesting]
        "/"
      else
        # Join whichever pieces matched and are non-empty with underscores.
        [$~[:before], $~[:acronym], $~[:after]]
          .compact
          .reject(&:empty?)
          .join("_")
      end
    }.downcase
  end

  # Converts a snake_case String (with `/` nesting) to CamelCase (with `::`
  # nesting) using a single gsub() pass.
  #
  # acronyms - object responding to inverted_regex() and capitalize();
  #            defaults to the shared String.acronyms manager
  #
  # Returns a new String.
  def CamelCase(acronyms = self.class.acronyms)
    gsub( %r{
      (?:
        (?: \A | _ | (?<nesting> / ) )
        (?<acronym> #{acronyms.inverted_regex} )
        (?= \b | [A-Z_] )
      )
      |
      (?: (?: \A | _ )     (?<letter> . ) )
      |
      (?: (?<nesting> / )  (?<letter> . ) )
    }mx ) {
      nested      = $~[:nesting] && "::"
      # Use the registered acronym spelling, or upcase the single letter.
      capitalized = acronyms.capitalize($~[:acronym]) { $~[:letter].upcase }
      "#{nested}#{capitalized}"
    }
  end

  # Same as CamelCase(), but with a leading uppercase letter downcased.
  def camelCase
    self.CamelCase.sub(/\A[A-Z]/) { |first_char| first_char.downcase }
  end

  # Returns the shared, lazily created AcronymManager used by default.
  def self.acronyms
    @acronyms ||= AcronymManager.new
  end
end

# Keeps a two-way mapping between acronyms (as added) and their downcased forms
# and builds the regexes the case conversions interpolate.
class AcronymManager
  # End of string followed by a literal "A".
  NEVER_MATCHES = /\zA/

  def initialize
    @acronyms = { }
    @inverted = { }
  end

  attr_reader :acronyms, :inverted
  private     :acronyms, :inverted

  # Registers an acronym, e.g. add("HTML"), and refreshes the inverted mapping.
  def add(acronym)
    acronyms[acronym] = acronym.downcase
    @inverted         = acronyms.invert
  end

  # Returns a Regexp matching any registered acronym as added, or
  # NEVER_MATCHES when none are registered.
  def regex
    return NEVER_MATCHES if acronyms.empty?

    /(?:#{acronyms.keys.map(&Regexp.method(:escape)).join('|')})/
  end

  # Returns a Regexp matching any registered acronym in its downcased form, or
  # NEVER_MATCHES when none are registered.
  def inverted_regex
    return NEVER_MATCHES if acronyms.empty?

    /(?:#{inverted.keys.map(&Regexp.method(:escape)).join('|')})/
  end

  # Returns the registered spelling for a downcased acronym; when the key is
  # not registered the result of the given block is returned instead.
  def capitalize(acronym, &default)
    inverted.fetch(acronym, &default)
  end
end

if $PROGRAM_NAME == __FILE__
  require "minitest/autorun"

  # Expectations are wrapped in _() because calling must_equal directly on the
  # value is deprecated by Minitest.
  describe "Case changing" do
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L118-L123
    let(:examples) { {
      "Product"               => "product",
      "SpecialGuest"          => "special_guest",
      "ApplicationController" => "application_controller",
      "Area51Controller"      => "area51_controller",
    } }
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test_cases.rb#L139-L145
    let(:one_way_snake_examples) { {
      "HTMLTidy"              => "html_tidy",
      "HTMLTidyGenerator"     => "html_tidy_generator",
      "FreeBSD"               => "free_bsd",
      "HTML"                  => "html",
      "ForceXMLController"    => "force_xml_controller"
    } }
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L98
    let(:one_way_camel_examples) { {
      "CamelCase"             => "Camel_Case"
    } }
    # added by James
    let(:path_examples) { {
      "SomeLib::WithClass"    => "some_lib/with_class"
    } }
    # https://github.com/rails/rails/blob/
    # 620f4a4fc962c863b91a51876ffdf58f33bedb9c/activesupport/test/
    # inflector_test.rb#L101-L145
    let(:acronym_examples) { {
      "API"               => "api",
      "APIController"     => "api_controller",
      "Nokogiri::HTML"    => "nokogiri/html",
      "HTTPAPI"           => "http_api",
      "HTTP::Get"         => "http/get",
      "SSLError"          => "ssl_error",
      "RESTful"           => "restful",
      "RESTfulController" => "restful_controller",
      "Nested::RESTful"   => "nested/restful",
      "IHeartW3C"         => "i_heart_w3c",
      "PhDRequired"       => "phd_required",
      "IRoRU"             => "i_ror_u",
      "RESTfulHTTPAPI"    => "restful_http_api",

      # misdirection
      "Capistrano"        => "capistrano",
      "CapiController"    => "capi_controller",
      "HttpsApis"         => "https_apis",
      "Html5"             => "html5",
      "Restfully"         => "restfully",
      "RoRails"           => "ro_rails"
    } }

    it "can snake_case a String" do
      examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
      end
    end

    it "can handle some tricky one-way cases for snake_case" do
      one_way_snake_examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
      end
    end

    it "can CamelCase a String" do
      examples.each do |camel, snake|
        _(snake.CamelCase).must_equal(camel)
      end
    end

    it "can handle some tricky one-way cases for CamelCase" do
      one_way_camel_examples.each do |camel, snakey|
        _(snakey.CamelCase).must_equal(camel)
      end
    end

    it "can camelCase a String" do
      _("camel_case".camelCase).must_equal("camelCase")
    end

    it "can convert nesting to paths and back" do
      path_examples.each do |camel, snake|
        _(camel.snake_case).must_equal(snake)
        _(snake.CamelCase).must_equal(camel)
      end
    end

    it "is aware of acronyms" do
      acronyms = AcronymManager.new
      acronyms.add("API")
      acronyms.add("HTML")
      acronyms.add("HTTP")
      acronyms.add("RESTful")
      acronyms.add("W3C")
      acronyms.add("PhD")
      acronyms.add("RoR")
      acronyms.add("SSL")

      acronym_examples.each do |camel, snake|
        _(camel.snake_case(acronyms)).must_equal(snake)
        _(snake.CamelCase(acronyms)).must_equal(camel)
      end
    end
  end
end
-
19
SEP
2014
"You can't parse [X]HTML with regex."
The only explanation I'll give for the following code is to provide this link to my favorite Stack Overflow answer.
#!/usr/bin/env ruby -w

# Downloads a Stack Overflow question page, parses the HTML into a tree of
# Hashes using one large regular expression, digs out one particular answer and
# prints its first paragraph converted to BBCode-style tags.

require "open-uri"

URL    = "http://stackoverflow.com/questions/1732348/" +
         "regex-match-open-tags-except-xhtml-self-contained-tags"
# Matches exactly one construct (doctype, comment, tag, attribute or text),
# plus any trailing whitespace, at the current scan position (\G).  The named
# groups that participated in the match tell the caller what was found.
PARSER = %r{
  (?<doctype_declaration> <!DOCTYPE\b (?<doctype> [^>]* ) > ){0}
  (?<comment>             <!-- .* -->                       ){0}

  (?<script_tag>
    < \s* (?<tag_name> script ) \s* (?<attributes> [^>]* > )
    (?<script> .*? )
    < \s* / \s* script \s* >
  ){0}
  (?<self_closed_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* / \s* > )
  ){0}
  (?<unclosed_tag>
    < \s*
    (?<tag_name> link | meta | br | input | hr | img ) \b
    \s*
    (?<attributes> [^>]* > )
  ){0}
  (?<open_tag>
    < \s* (?<tag_name> \w+ ) \s* (?<attributes> [^>]* > )
  ){0}
  (?<close_tag>
    < \s* / \s* (?<tag_name> \w+ ) \s* >
  ){0}

  (?<attribute>
    (?<attribute_name> [-\w]+ )
    (?: \s* = \s* (?<attribute_value> "[^"]*" | '[^']*' | [^>\s]+ ) )? \s*
  ){0}
  (?<attribute_list>
    \g<attribute>
    (?= [^>]* > \z )  # attributes keep a trailing > to disambiguate from text
  ){0}

  (?<text>
    (?! [^<]* /?\s*> \z )  # a guard to prevent this from parsing attributes
    [^<]+
  ){0}

  \G
  (?:
    \g<doctype_declaration>
    |
    \g<comment>
    |
    \g<script_tag>
    |
    \g<self_closed_tag>
    |
    \g<unclosed_tag>
    |
    \g<open_tag>
    |
    \g<attribute_list>
    |
    \g<close_tag>
    |
    \g<text>
  )
  \s*
}mix

# Parses html into a tree of Hashes by repeatedly removing one PARSER match
# from the front of the String.
#
# html - String of markup; it is consumed (mutated) while parsing
#
# Returns whatever is on top of the stack when no further match is found (the
# root node when every opened tag was closed).
def parse(html)
  stack = [{attributes: [ ], contents: [ ], name: :root}]
  loop do
    html.sub!(PARSER, "") or break
    if $~[:doctype_declaration]
      add_to_tree(stack.last, "DOCTYPE", $~[:doctype].strip)
    elsif $~[:script_tag]
      add_to_stack(stack, $~[:tag_name], $~[:attributes], $~[:script])
    elsif $~[:self_closed_tag] || $~[:unclosed_tag] || $~[:open_tag]
      # Only a plain open tag (truthy last argument) stays on the stack.
      add_to_stack(stack, $~[:tag_name], $~[:attributes], "", $~[:open_tag])
    elsif $~[:close_tag]
      stack.pop
    elsif $~[:text]
      stack.last[:contents] << $~[:text]
    end
  end
  stack.pop
end

# Stores value in branch under name.  When name is already present, the
# existing entry is turned into an Array (unless it already is one) and value
# is appended to it.
def add_to_tree(branch, name, value)
  if branch.include?(name)
    branch[name] = [branch[name]] unless branch[name].is_a?(Array)
    branch[name] << value
  else
    branch[name] = value
  end
end

# Builds a tag node and attaches it to the node on top of the stack, both by
# tag name and in its :contents list.  The new node is pushed onto the stack
# only when open is truthy.
def add_to_stack(stack, tag_name, attributes_html, contents, open = false)
  tag = { attributes: parse_attributes(attributes_html),
          contents:   [contents].reject(&:empty?),
          name:       tag_name }
  add_to_tree(stack.last, tag_name, tag)
  stack.last[:contents] << tag
  stack << tag if open
end

# Parses the attribute portion of a tag (consuming attributes_html) into a
# Hash.  An attribute without a value gets its own name as the value, and
# matching surrounding quotes are removed from values.
def parse_attributes(attributes_html)
  attributes = { }
  loop do
    attributes_html.sub!(PARSER, "") or break
    add_to_tree(
      attributes,
      $~[:attribute_name],
      ($~[:attribute_value] || $~[:attribute_name]).sub(/\A(["'])(.*)\1\z/, '\2')
    )
  end
  attributes
end

# Renders a Hash node as "[name]...[/name]", recursing into its contents and
# renaming "strike" to "s"; anything that is not a Hash is returned unchanged.
def convert_to_bbcode(node)
  if node.is_a?(Hash)
    name = node[:name].sub(/\Astrike\z/, "s")
    "[#{name}]#{node[:contents].map { |c| send(__method__, c) }.join}[/#{name}]"
  else
    node
  end
end

# URI.open is used because open-uri no longer extends Kernel#open on Ruby 3.0+.
html = URI.open(URL, &:read).strip
ast  = parse(html)
puts ast["html"]["body"]["div"]
  .find { |div| div[:attributes]["class"] == "container"   }["div"]
  .find { |div| div[:attributes]["id"] == "content"        }["div"]["div"]
  .find { |div| div[:attributes]["id"] == "mainbar"        }["div"]
  .find { |div| div[:attributes]["id"] == "answers"        }["div"]
  .find { |div| div[:attributes]["id"] == "answer-1732454" }["table"]["tr"]
  .first["td"]
  .find { |div| div[:attributes]["class"] == "answercell"  }["div"]["p"]
  .first[:contents]
  .map(&method(:convert_to_bbcode))  # to reach a wider audience
  .join