Use tidy binary when available instead of the DL binding of libtidy.so (the
latter is broken by DL security fixes in Ruby 1.8.7-p72, see
http://bugs.debian.org/500461).

diff --exclude=debian -urN samizdat-0.6.1/lib/samizdat/models/content.rb samizdat-0.6.1-tidy/lib/samizdat/models/content.rb
--- samizdat-0.6.1/lib/samizdat/models/content.rb	2009-02-02 16:15:15.000000000 +0200
+++ samizdat-0.6.1-tidy/lib/samizdat/models/content.rb	2009-02-02 16:22:10.000000000 +0200
@@ -238,7 +238,11 @@
           } + "</p>\n"
         }.join
       end
-    Samizdat::Sanitize.new(config.xhtml).sanitize(html)
+    begin
+      Samizdat::Sanitize.new(config.xhtml).sanitize(html)
+    rescue Samizdat::SanitizeError => e
+      raise UserError, CGI.escapeHTML(e.message).untaint
+    end
   end
 
   private
diff --exclude=debian -urN samizdat-0.6.1/lib/samizdat/sanitize.rb samizdat-0.6.1-tidy/lib/samizdat/sanitize.rb
--- samizdat-0.6.1/lib/samizdat/sanitize.rb	2009-02-02 16:15:15.000000000 +0200
+++ samizdat-0.6.1-tidy/lib/samizdat/sanitize.rb	2009-02-02 16:22:30.000000000 +0200
@@ -8,12 +8,8 @@
 #
 # vim: et sw=2 sts=2 ts=8 tw=0
 
-require 'cgi'
-require 'yaml'
 require 'rexml/document'
 
-require 'tidy'
-
 # use (") instead of (') in XML attributes, escape both of them
 #
 module REXML
@@ -29,14 +25,13 @@
 
 module Samizdat
 
-class Sanitize
-  TIDY_PATH = '/usr/lib/libtidy.so'
+class SanitizeError < RuntimeError; end   # raised by Sanitize when HTML can't be tidied or parsed
 
+class Sanitize
   begin
-
     FORMATTER = REXML::Formatters::Default.new(true)   # enable IE hack
 
-  rescue LoadError
+  rescue LoadError, NameError
 
     # backwards compatibility for Ruby versions without REXML::Formatters
     #
@@ -50,13 +45,15 @@
     FORMATTER = LegacyFormatter.new
   end
 
-  def initialize(xhtml, tidypath=TIDY_PATH)
+  # _xhtml_ is expected to be loaded from xhtml.yaml.
+  #
+  # _tidypath_ may point to a binary or library; if nil, well-known locations
+  # are searched. A library (detected by .so in the file name) is used via the
+  # Ruby/Tidy DL-based wrapper library; a binary has HTML piped through it.
+  #
+  def initialize(xhtml, tidypath=nil)
     @xhtml = xhtml
-
-    # workaround for memory leak in Tidy.path=
-    if not defined?(@@tidypath) or tidypath != @@tidypath
-      Tidy.path = @@tidypath = tidypath
-    end
+    set_tidy(tidypath)
   end
 
   attr_reader :xhtml
@@ -103,35 +100,113 @@
   # filter HTML through Tidy
   #
   def tidy(html)
-    xml = Tidy.open(:output_xhtml => true, :literal_attributes => true,
-      :tidy_mark => false, :wrap => 0, :char_encoding => 'utf8'
-    ) {|tidy| tidy.clean(html.to_s.untaint) }
-
-    xml.taint
+    @tidy_binary ? tidy_pipe(html) : tidy_dl(html)   # binary selected => pipe, otherwise DL binding
   end
 
   # return sanitized HTML
   #
-  def sanitize(html, fragment=true, filter=@xhtml)
+  def sanitize(html, filter=@xhtml)   # raises SanitizeError on empty or unparsable input
+    html = tidy(html)
+    (html.nil? or html.empty?) and raise SanitizeError,
+      "Invalid HTML detected"
+
     begin
-      xml = REXML::Document.new(tidy(html)).root
-      xml = xml.elements['//html/body'] if fragment   # work around tidy
+      xml = REXML::Document.new(html).root
+      xml = xml.elements['//html/body']   # keep only the <body> element
     rescue REXML::ParseException
-      raise RuntimeError, "Invalid HTML detected: " +
-        CGI.escapeHTML($!.continued_exception.to_s.gsub!(/\n.*/, ''))
+      raise SanitizeError, "Invalid XHTML detected: " +
+        $!.continued_exception.to_s.gsub(/\n.*/, '')   # first line of the parser message only
     end
 
     sanitize_element(xml, filter)
 
     html = ''
-    if fragment
-      xml.each {|child| FORMATTER.write(child, html) }
-    else
-      FORMATTER.write(xml, html)
-    end
+    xml.each {|child| FORMATTER.write(child, html) }   # serialize body's children, not body itself
 
     html
   end
+
+  private
+
+  SO_PATH_PATTERN = /\.so(?:\..+)?\z/.freeze   # e.g. "libtidy.so", "libtidy.so.0"
+
+  def is_so?(path)   # true-ish when _path_ is named like a shared library and is readable
+    (path =~ SO_PATH_PATTERN) && File.readable?(path)
+  end
+
+  def set_tidy(tidypath)   # pick the Tidy backend (binary or library) for this instance
+    if tidypath.nil?
+      # no explicit path given: take the first well-known location present
+      tidypath = [
+        '/usr/bin/tidy',
+        '/usr/local/bin/tidy',
+        '/usr/lib/libtidy.so',
+        '/usr/local/lib/libtidy.so'
+      ].find {|path| File.exist?(path) }
+
+      # nothing installed: fail clearly instead of handing nil to File below
+      tidypath or raise RuntimeError, 'Tidy binary or library not found'
+    end
+
+    if is_so?(tidypath)
+      require 'tidy'
+
+      # workaround for memory leak in Tidy.path=
+      if not defined?(@@tidysopath) or tidypath != @@tidysopath
+        Tidy.path = @@tidysopath = tidypath
+      end
+
+      @tidy_binary = nil   # library chosen: tidy() will go through tidy_dl
+
+    elsif File.executable?(tidypath)
+      @tidy_binary = tidypath
+    end
+
+    require 'open3' if @tidy_binary
+  end
+
+  def tidy_dl(html)   # filter _html_ through libtidy using the Ruby/Tidy binding
+    options = {
+      :quiet => true,
+      :show_warnings => false,
+      :show_errors => 1,
+      :output_xhtml => true,
+      :literal_attributes => true,
+      :preserve_entities => true,
+      :tidy_mark => false,
+      :wrap => 0,
+      :char_encoding => 'utf8' }
+    cleaned = Tidy.open(options) {|doc| doc.clean(html.to_s.untaint) }
+    cleaned.taint   # input was untainted for Tidy; mark the output tainted again
+  end
+
+  def tidy_pipe(html)   # filter _html_ through the tidy binary; raises SanitizeError
+    stdin, stdout, stderr =
+      Open3.popen3(@tidy_binary,   # argument list, so no shell parses the path
+                   '--quiet', 'yes',
+                   '--show-warnings', 'no',
+                   '--show-errors', '1',
+                   '--output-xhtml', 'yes',
+                   '--literal-attributes', 'yes',
+                   '--preserve-entities', 'yes',
+                   '--tidy-mark', 'no',
+                   '--wrap', '0',
+                   '--char-encoding', 'utf8')
+
+    stdin.write(html.to_s.untaint)
+    stdin.close
+
+    # drain stdout first: large output would block tidy while we wait on stderr
+    xhtml = stdout.read
+    stdout.close
+    errors = stderr.read
+    stderr.close
+
+    errors.nil? or errors.empty? or raise SanitizeError,
+      "Invalid HTML detected: " + errors
+
+    xhtml
+  end
 end
 
 end   # module Samizdat
