Differences

This shows you the differences between two versions of the page.

--- oracle:fixhtml [2010/11/03 13:32] – rlunaro
+++ oracle:fixhtml [2011/03/19 20:37] – external edit 127.0.0.1
@@ Line 1: / Line 1: @@
-====== Fix or Sanitize HTML code from Word ======
-Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word.
-To the point. This snippet:
-<code>
-select dirty, strip_html(dirty) from dual;
-</code>
-Removes all the HTML tags from the html code. But this one:
-<code>
-select dirty, strip_html(dirty,2) from dual;
-</code>
-Wipes out all the garbage who is in the html code, leaving it --more or less-- "clean". And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:
-<code plsql>
- create or replace function strip_html(dirty in clob,
-                                      to_cvs in number default 0)
-  return clob is out clob ;
-  type arr_string is varray (200) of varchar2(64);
-  entities_search_for arr_string;
-  entities_replace arr_string;
-  cont number;
-begin
--- to accelerate the issue
-if dirty is null then
-   return dirty;
-end if; -- isnull(dirty)
-if length( dirty ) = 0 then
-   return dirty;
-end if; -- length(dirty)
-entities_search_for := arr_string(
-'&excl;',
-'&num;',
-'&dollar;',
-'&percnt;',
-'&amp;',
-'&quot;',
-'&lpar;',
-'&rpar;',
-'&midast;',
-'&plus;',
-'&comma;',
-'&hyphen;',
-'&period;',
-'&sol;',
-'&colon;',
-'&semi;',
-'&lt;',
-'&equals;',
-'&gt;',
-'&quest;',
-'&commat;',
-'&lsqb;',
-'&bsol;',
-'&rsqb;',
-'&circ;',
-'&lowbar;',
-'&grave;',
-'&lcub;',
-'&verbar;',
-'&rcub;',
-'&tilde;',
-'&nbsp;',
-'&iexcl;',
-'&cent;',
-'&pound;',
-'&curren;',
-'&yen;',
-'&brvbar;',
-'&sect;',
-'&Dot;',
-'&copy;',
-'&ordf;',
-'&laquo;',
-'&not;',
-'&shy;',
-'&reg;',
-'&macr;',
-'&deg;',
-'&plusmn;',
-'&sup2;',
-'&sup3;',
-'&acute;',
-'&micro;',
-'&para;',
-'&middot;',
-'&cedil;',
-'&sup1;',
-'&ordm;',
-'&raquo;',
-'&fr;',
-'&fr;',
-'&fr;',
-'&iquest;',
-'&Agrave;',
-'&Aacute;',
-'&Acirc;',
-'&Atilde;',
-'&Auml;',
-'&Aring;',
-'&AElig;',
-'&il;',
-'&Egrave;',
-'&Eacute;',
-'&Ecirc;',
-'&Euml;',
-'&Igrave;',
-'&Iacute;',
-'&Icirc;',
-'&Iuml;',
-'&ETH;',
-'&Ntilde;',
-'&Ograve;',
-'&Oacute;',
-'&Ocirc;',
-'&Otilde;',
-'&Ouml;',
-'&times;',
-'&Oslash;',
-'&Ugrave;',
-'&Uacute;',
-'&Ucirc;',
-'&Uuml;',
-'&Yacute;',
-'&THORN;',
-'&szlig;',
-'&agrave;',
-'&aacute;',
-'&acirc;',
-'&atilde;',
-'&auml;',
-'&egrave;',
-'&eacute;',
-'&ecirc;',
-'&etilde;',
-'&euml;',
-'&igrave;',
-'&iacute;',
-'&icirc;',
-'&itilde;',
-'&iuml;',
-'&ograve;',
-'&oacute;',
-'&ocirc;',
-'&otilde;',
-'&ouml;',
-'&ugrave;',
-'&uacute;',
-'&ucirc;',
-'&utilde;',
-'&uuml;');
-entities_replace := arr_string(
-'¡',
-'º',
-'$',
-'%',
-'&',
-'"',
-'(',
-')',
-'*',
-'+',
-',',
-'-',
-'.',
-'Sol',
-'Colon',
-'*',
-'<',
-'=',
-'>',
-'?',
-',',
-'*',
-'*',
-'*',
-'*',
-'_',
-'''',
-'*',
-'*',
-'*',
-'''',
-' ',
-'¡',
-'cent',
-'L',
-'*',
-'Y',
-'*',
-'*',
-'.',
-'(c)',
-'*',
-'*',
-'!',
-'*',
-'(r)',
-'*',
-'*',
-'*',
-'*',
-'*',
-'á',
-'u',
-'*',
-'·',
-'ç',
-'*',
-'*',
-'*',
-'*',
-'*',
-'*',
-'¿',
-'È',
-'Á',
-'Ä',
-'Á',
-'*',
-'*',
-'AE',
-'*',
-'È',
-'É',
-'*',
-'*',
-'Ì',
-'Í',
-'Î',
-'*',
-'*',
-'N',
-'Ò',
-'Ó',
-'Ô',
-'O',
-'*',
-'*',
-'O',
-'Ù',
-'Ú',
-'Û',
-'*',
-'*',
-'*',
-'*',
-'à',
-'á',
-'â',
-'a',
-'*',
-'è',
-'é',
-'ê',
-'e',
-'*',
-'ì',
-'í',
-'î',
-'i',
-'*',
-'ò',
-'ó',
-'ô',
-'o',
-'*',
-'ù',
-'ú',
-'û',
-'u',
-'*');
-  out := dirty;
-  -- replace what is enclosed between <xml> and </xml>
-  -- *? -> lazy star (catches the minimum possible)
-  out := regexp_replace(out, '<xml>.*?</xml>', '', 1, 0, 'ni' );
-  -- clean what it is inside the style tags
-  out := regexp_replace(out, '<style>.*?</style>', '', 1, 0, 'ni' );
-  if to_cvs = 2 then
-     -- sanitize (not clean) the html
-     -- clean the tag <?xml:whatever>
-     out := regexp_replace(out, '<\?xml:.*?>', '', 1, 0, 'ni');
-     -- clean the tags <img whatever>
-     out := regexp_replace(out, '<img.*?>', '', 1, 0, 'ni');
-     -- clean comments
-     out := regexp_replace(out,'<!--.*?-->','', 1, 0, 'ni');
-     -- clean meta
-     out := regexp_replace(out,'<meta.*?>','', 1, 0, 'ni');
-     -- clean link
-     out := regexp_replace(out,'<link.*?>','', 1, 0, 'ni');
-     -- clean DIV
-     out := regexp_replace(out,'</?div.*?>','', 1, 0, 'ni');
-     -- clean SPAN
-     out := regexp_replace(out,'</?span.*?>','', 1, 0, 'ni');
-     -- clean "class inside tags"
-     out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');
-     -- clean "style" inside the following tags: i b p
-     out := regexp_replace(out,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
-     -- clean namespaces <o:p> </o:p>
-     out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
-     out := regexp_replace(out, '(</)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
-     -- clean empty opening and closing tags: it has to be
-     -- passed twice or three times to clean things like this:
-     -- <strong><u></u></strong>
-     -- TWEAK: <p></p> must be replaced by <br/>
-     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
-     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
-     -- TWEAK: <p></p> must be replaced by <br/>
-     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
-     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
-  else
-    -- clean html
-    -- replace all the stuff that is similar to a carriage return
-    out := regexp_replace(out, '</p[^>]*>',chr(10)||chr(13));
-    out := regexp_replace(out, '</br[^>]*>',chr(10)||chr(13));
-    out := regexp_replace(out, '</tr[^>]*>',chr(10)||chr(13));
-    -- replace all the remaining html stuff
-    out := regexp_replace(out,'<[^>]*>','', 1, 0, 'ni');
-    -- replace all the entities
-    for cont in 1..119 loop
-      out := replace( out, entities_search_for(cont), entities_replace(cont) );
-    end loop;
-    -- cleaning for export to cvs
-    if to_cvs = 1 then
-       out := replace( out, chr(10), '' );
-       out := replace( out, chr(13), '' );
-  	   out := replace( out, chr(9), '' );
-       out := replace( out, ';', ',' );
-       out := replace( out, '"', '''' );
-    end if;
-  end if;
-  return(out);
-end strip_html;
-</code>