User Tools

Site Tools


oracle:fixhtml

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Next revision
Previous revision
oracle:fixhtml [2010/09/13 17:07] – creado rlunaro | oracle:fixhtml [2022/12/02 22:02] (current) – external edit 127.0.0.1
Line 1: Line 1:
-====== Fix or Sanitize HTML code from Word ======+====== Fix or Sanitize HTML code from Microsoft Word ======
  
 Yes: I've found the silver bullet for those of you who are looking for a function that cleans or sanitizes HTML code, especially if it comes from a cut-and-paste operation from Word.  Yes: I've found the silver bullet for those of you who are looking for a function that cleans or sanitizes HTML code, especially if it comes from a cut-and-paste operation from Word. 
Line 17: Line 17:
 It wipes out all the garbage that is in the HTML code, leaving it --more or less-- "clean". And yes, I am using regular expressions to perform the fixing, so it is easy to port it to Java or other programming languages. Without further preamble, here is the code: It wipes out all the garbage that is in the HTML code, leaving it --more or less-- "clean". And yes, I am using regular expressions to perform the fixing, so it is easy to port it to Java or other programming languages. Without further preamble, here is the code:
  
-<code> +<code plsql
-/* + 
-*/ + create or replace function strip_html(dirty in clob,
-create or replace function strip_html(dirty in clob,+
                                       to_cvs in number default 0)                                       to_cvs in number default 0)
   return clob is out clob ;   return clob is out clob ;
Line 296: Line 295:
   if to_cvs = 2 then   if to_cvs = 2 then
      -- sanitize (not clean) the html      -- sanitize (not clean) the html
-     + 
 +     -- clean the tag <?xml:whatever> 
 +     out := regexp_replace(out, '<\?xml:.*?>', '', 1, 0, 'ni'); 
 +     -- clean the tags <img whatever> 
 +     out := regexp_replace(out, '<img.*?>', '', 1, 0, 'ni');
      -- clean comments      -- clean comments
      out := regexp_replace(out,'<!--.*?-->','', 1, 0, 'ni');      out := regexp_replace(out,'<!--.*?-->','', 1, 0, 'ni');
Line 309: Line 312:
      -- clean "class inside tags"      -- clean "class inside tags"
      out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');      out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');
 +     -- clean "style" inside the following tags: i b p
 +     out := regexp_replace(out,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
      -- clean namespaces <o:p> </o:p>      -- clean namespaces <o:p> </o:p>
      out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');      out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
Line 354: Line 359:
   return(out);   return(out);
 end strip_html; end strip_html;
 +
  
  
oracle/fixhtml.1284390424.txt.gz · Last modified: 2022/12/02 22:02 (external edit)