Differences

This shows you the differences between two versions of the page.

@@ Line 1: / Line 1: @@
+====== Fix or Sanitize HTML code from Microsoft Word ======
+Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word.
+To the point. This snippet:
+<code>
+select dirty, strip_html(dirty) from dual;
+</code>
+Removes all the HTML tags from the html code. But this one:
+<code>
+select dirty, strip_html(dirty,2) from dual;
+</code>
+Wipes out all the garbage who is in the html code, leaving it --more or less-- "clean". And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:
+<code plsql>
+ create or replace function strip_html(dirty in clob,
+                                      to_cvs in number default 0)
+  return clob is out clob ;
+  type arr_string is varray (200) of varchar2(64);
+  entities_search_for arr_string;
+  entities_replace arr_string;
+  cont number;
+begin
+-- to accelerate the issue
+if dirty is null then
+   return dirty;
+end if; -- isnull(dirty)
+if length( dirty ) = 0 then
+   return dirty;
+end if; -- length(dirty)
+entities_search_for := arr_string(
+'&excl;',
+'&num;',
+'&dollar;',
+'&percnt;',
+'&amp;',
+'&quot;',
+'&lpar;',
+'&rpar;',
+'&midast;',
+'&plus;',
+'&comma;',
+'&hyphen;',
+'&period;',
+'&sol;',
+'&colon;',
+'&semi;',
+'&lt;',
+'&equals;',
+'&gt;',
+'&quest;',
+'&commat;',
+'&lsqb;',
+'&bsol;',
+'&rsqb;',
+'&circ;',
+'&lowbar;',
+'&grave;',
+'&lcub;',
+'&verbar;',
+'&rcub;',
+'&tilde;',
+'&nbsp;',
+'&iexcl;',
+'&cent;',
+'&pound;',
+'&curren;',
+'&yen;',
+'&brvbar;',
+'&sect;',
+'&Dot;',
+'&copy;',
+'&ordf;',
+'&laquo;',
+'&not;',
+'&shy;',
+'&reg;',
+'&macr;',
+'&deg;',
+'&plusmn;',
+'&sup2;',
+'&sup3;',
+'&acute;',
+'&micro;',
+'&para;',
+'&middot;',
+'&cedil;',
+'&sup1;',
+'&ordm;',
+'&raquo;',
+'&fr;',
+'&fr;',
+'&fr;',
+'&iquest;',
+'&Agrave;',
+'&Aacute;',
+'&Acirc;',
+'&Atilde;',
+'&Auml;',
+'&Aring;',
+'&AElig;',
+'&il;',
+'&Egrave;',
+'&Eacute;',
+'&Ecirc;',
+'&Euml;',
+'&Igrave;',
+'&Iacute;',
+'&Icirc;',
+'&Iuml;',
+'&ETH;',
+'&Ntilde;',
+'&Ograve;',
+'&Oacute;',
+'&Ocirc;',
+'&Otilde;',
+'&Ouml;',
+'&times;',
+'&Oslash;',
+'&Ugrave;',
+'&Uacute;',
+'&Ucirc;',
+'&Uuml;',
+'&Yacute;',
+'&THORN;',
+'&szlig;',
+'&agrave;',
+'&aacute;',
+'&acirc;',
+'&atilde;',
+'&auml;',
+'&egrave;',
+'&eacute;',
+'&ecirc;',
+'&etilde;',
+'&euml;',
+'&igrave;',
+'&iacute;',
+'&icirc;',
+'&itilde;',
+'&iuml;',
+'&ograve;',
+'&oacute;',
+'&ocirc;',
+'&otilde;',
+'&ouml;',
+'&ugrave;',
+'&uacute;',
+'&ucirc;',
+'&utilde;',
+'&uuml;');
+entities_replace := arr_string(
+'¡',
+'º',
+'$',
+'%',
+'&',
+'"',
+'(',
+')',
+'*',
+'+',
+',',
+'-',
+'.',
+'Sol',
+'Colon',
+'*',
+'<',
+'=',
+'>',
+'?',
+',',
+'*',
+'*',
+'*',
+'*',
+'_',
+'''',
+'*',
+'*',
+'*',
+'''',
+' ',
+'¡',
+'cent',
+'L',
+'*',
+'Y',
+'*',
+'*',
+'.',
+'(c)',
+'*',
+'*',
+'!',
+'*',
+'(r)',
+'*',
+'*',
+'*',
+'*',
+'*',
+'á',
+'u',
+'*',
+'·',
+'ç',
+'*',
+'*',
+'*',
+'*',
+'*',
+'*',
+'¿',
+'È',
+'Á',
+'Ä',
+'Á',
+'*',
+'*',
+'AE',
+'*',
+'È',
+'É',
+'*',
+'*',
+'Ì',
+'Í',
+'Î',
+'*',
+'*',
+'N',
+'Ò',
+'Ó',
+'Ô',
+'O',
+'*',
+'*',
+'O',
+'Ù',
+'Ú',
+'Û',
+'*',
+'*',
+'*',
+'*',
+'à',
+'á',
+'â',
+'a',
+'*',
+'è',
+'é',
+'ê',
+'e',
+'*',
+'ì',
+'í',
+'î',
+'i',
+'*',
+'ò',
+'ó',
+'ô',
+'o',
+'*',
+'ù',
+'ú',
+'û',
+'u',
+'*');
+  out := dirty;
+  -- replace what is enclosed between <xml> and </xml>
+  -- *? -> lazy star (catches the minimum possible)
+  out := regexp_replace(out, '<xml>.*?</xml>', '', 1, 0, 'ni' );
+  -- clean what it is inside the style tags
+  out := regexp_replace(out, '<style>.*?</style>', '', 1, 0, 'ni' );
+  if to_cvs = 2 then
+     -- sanitize (not clean) the html
+     -- clean the tag <?xml:whatever>
+     out := regexp_replace(out, '<\?xml:.*?>', '', 1, 0, 'ni');
+     -- clean the tags <img whatever>
+     out := regexp_replace(out, '<img.*?>', '', 1, 0, 'ni');
+     -- clean comments
+     out := regexp_replace(out,'<!--.*?-->','', 1, 0, 'ni');
+     -- clean meta
+     out := regexp_replace(out,'<meta.*?>','', 1, 0, 'ni');
+     -- clean link
+     out := regexp_replace(out,'<link.*?>','', 1, 0, 'ni');
+     -- clean DIV
+     out := regexp_replace(out,'</?div.*?>','', 1, 0, 'ni');
+     -- clean SPAN
+     out := regexp_replace(out,'</?span.*?>','', 1, 0, 'ni');
+     -- clean "class inside tags"
+     out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');
+     -- clean "style" inside the following tags: i b p
+     out := regexp_replace(out,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
+     -- clean namespaces <o:p> </o:p>
+     out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
+     out := regexp_replace(out, '(</)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
+     -- clean empty opening and closing tags: it has to be
+     -- passed twice or three times to clean things like this:
+     -- <strong><u></u></strong>
+     -- TWEAK: <p></p> must be replaced by <br/>
+     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
+     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
+     -- TWEAK: <p></p> must be replaced by <br/>
+     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
+     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
+  else
+    -- clean html
+    -- replace all the stuff that is similar to a carriage return
+    out := regexp_replace(out, '</p[^>]*>',chr(10)||chr(13));
+    out := regexp_replace(out, '</br[^>]*>',chr(10)||chr(13));
+    out := regexp_replace(out, '</tr[^>]*>',chr(10)||chr(13));
+    -- replace all the remaining html stuff
+    out := regexp_replace(out,'<[^>]*>','', 1, 0, 'ni');
+    -- replace all the entities
+    for cont in 1..119 loop
+      out := replace( out, entities_search_for(cont), entities_replace(cont) );
+    end loop;
+    -- cleaning for export to cvs
+    if to_cvs = 1 then
+       out := replace( out, chr(10), '' );
+       out := replace( out, chr(13), '' );
+  	   out := replace( out, chr(9), '' );
+       out := replace( out, ';', ',' );
+       out := replace( out, '"', '''' );
+    end if;
+  end if;
+  return(out);
+end strip_html;
+</code>