This is an old revision of the document!

Fix or Sanitize HTML code from Word

Yes: I've found the silver bullet for those of you who are looking for a function that cleans or sanitizes HTML code, especially when it comes from a cut-and-paste operation from Word.

To the point. This snippet:

select dirty, strip_html(dirty) from dual;

Removes all the HTML tags from the html code. But this one:

select dirty, strip_html(dirty,2) from dual;

Wipes out all the garbage that is in the HTML code, leaving it –more or less– “clean”. And yes, I am using regular expressions to perform the fixing, so it is easy to port it to Java or other programming languages. Without further preamble, here is the code:

/*
  strip_html - strip or sanitize HTML, typically HTML pasted from Word.

  Parameters
    dirty   HTML text to process. NULL or empty input is returned unchanged.
    to_cvs  processing mode (the historical spelling of the name is kept so
            that callers using named notation keep working):
              0 (default)  remove every tag and decode named entities,
                           returning plain text; </p>, <br> and </tr>
                           become CR+LF line breaks.
              1            same as 0, then flatten for CSV export: CR, LF
                           and TAB are removed, ';' becomes ',' and '"'
                           becomes a single quote.
              2            sanitize: keep the markup but drop comments,
                           <meta>, <link>, <div>, <span>, class attributes,
                           namespace prefixes (<o:p> -> <p>) and empty
                           element pairs.
            Any other value (including NULL) behaves like 0.

  Returns
    The processed text as a CLOB.

  Notes
    * <xml>...</xml>, <style>...</style> and <!-- ... --> blocks are removed
      in every mode.
    * The entity table is deliberately lossy: '*' is the placeholder for
      entities that have no mapped replacement, and some characters are
      approximated in ASCII ('(c)', 'AE', '1/2', ...).
    * The source contains many '&name;' literals. When compiling through
      SQL*Plus, run SET DEFINE OFF first so they are not treated as
      substitution variables.
*/
create or replace function strip_html(dirty in clob,
                                      to_cvs in number default 0)
  return clob is

  type arr_string is varray (200) of varchar2(64);

  -- Parallel tables: entities_replace(i) is the text substituted for
  -- entities_search_for(i). Both must keep the same number of entries.
  entities_search_for arr_string;
  entities_replace    arr_string;

  -- Working copy of the input; also the value returned.
  cleaned clob;

  -- Line break emitted for block-level closing tags (CR followed by LF).
  crlf constant varchar2(2) := chr(13) || chr(10);

begin

  -- Nothing to do for NULL or empty input.
  if dirty is null or length(dirty) = 0 then
     return dirty;
  end if;

  entities_search_for := arr_string(
    '&excl;',
    '&num;',
    '&dollar;',
    '&percnt;',
    '&amp;',
    '&quot;',
    '&lpar;',
    '&rpar;',
    '&midast;',
    '&plus;',
    '&comma;',
    '&hyphen;',
    '&period;',
    '&sol;',
    '&colon;',
    '&semi;',
    '&lt;',
    '&equals;',
    '&gt;',
    '&quest;',
    '&commat;',
    '&lsqb;',
    '&bsol;',
    '&rsqb;',
    '&circ;',
    '&lowbar;',
    '&grave;',
    '&lcub;',
    '&verbar;',
    '&rcub;',
    '&tilde;',
    '&nbsp;',
    '&iexcl;',
    '&cent;',
    '&pound;',
    '&curren;',
    '&yen;',
    '&brvbar;',
    '&sect;',
    '&Dot;',
    '&copy;',
    '&ordf;',
    '&laquo;',
    '&not;',
    '&shy;',
    '&reg;',
    '&macr;',
    '&deg;',
    '&plusmn;',
    '&sup2;',
    '&sup3;',
    '&acute;',
    '&micro;',
    '&para;',
    '&middot;',
    '&cedil;',
    '&sup1;',
    '&ordm;',
    '&raquo;',
    '&frac14;',
    '&frac12;',
    '&frac34;',
    '&iquest;',
    '&Agrave;',
    '&Aacute;',
    '&Acirc;',
    '&Atilde;',
    '&Auml;',
    '&Aring;',
    '&AElig;',
    '&Ccedil;',
    '&Egrave;',
    '&Eacute;',
    '&Ecirc;',
    '&Euml;',
    '&Igrave;',
    '&Iacute;',
    '&Icirc;',
    '&Iuml;',
    '&ETH;',
    '&Ntilde;',
    '&Ograve;',
    '&Oacute;',
    '&Ocirc;',
    '&Otilde;',
    '&Ouml;',
    '&times;',
    '&Oslash;',
    '&Ugrave;',
    '&Uacute;',
    '&Ucirc;',
    '&Uuml;',
    '&Yacute;',
    '&THORN;',
    '&szlig;',
    '&agrave;',
    '&aacute;',
    '&acirc;',
    '&atilde;',
    '&auml;',
    '&egrave;',
    '&eacute;',
    '&ecirc;',
    '&etilde;',
    '&euml;',
    '&igrave;',
    '&iacute;',
    '&icirc;',
    '&itilde;',
    '&iuml;',
    '&ograve;',
    '&oacute;',
    '&ocirc;',
    '&otilde;',
    '&ouml;',
    '&ugrave;',
    '&uacute;',
    '&ucirc;',
    '&utilde;',
    '&uuml;');

  entities_replace := arr_string(
    '!',      -- &excl;
    '#',      -- &num;
    '$',      -- &dollar;
    '%',      -- &percnt;
    '&',      -- &amp;   (applied last, see the decoding loop)
    '"',      -- &quot;
    '(',      -- &lpar;
    ')',      -- &rpar;
    '*',      -- &midast;
    '+',      -- &plus;
    ',',      -- &comma;
    '-',      -- &hyphen;
    '.',      -- &period;
    '/',      -- &sol;
    ':',      -- &colon;
    ';',      -- &semi;
    '<',      -- &lt;
    '=',      -- &equals;
    '>',      -- &gt;
    '?',      -- &quest;
    '@',      -- &commat;
    '[',      -- &lsqb;
    '\',      -- &bsol;
    ']',      -- &rsqb;
    '^',      -- &circ;
    '_',      -- &lowbar;
    '`',      -- &grave;
    '{',      -- &lcub;
    '|',      -- &verbar;
    '}',      -- &rcub;
    '~',      -- &tilde;
    ' ',      -- &nbsp;
    '¡',      -- &iexcl;
    'cent',   -- &cent;
    'L',      -- &pound;
    '*',      -- &curren;
    'Y',      -- &yen;
    '*',      -- &brvbar;
    '*',      -- &sect;
    '.',      -- &Dot;
    '(c)',    -- &copy;
    '*',      -- &ordf;
    '*',      -- &laquo;
    '!',      -- &not;
    '*',      -- &shy;
    '(r)',    -- &reg;
    '*',      -- &macr;
    '*',      -- &deg;
    '*',      -- &plusmn;
    '*',      -- &sup2;
    '*',      -- &sup3;
    '´',      -- &acute;
    'u',      -- &micro;
    '*',      -- &para;
    '·',      -- &middot;
    '¸',      -- &cedil;
    '*',      -- &sup1;
    '*',      -- &ordm;
    '*',      -- &raquo;
    '1/4',    -- &frac14;
    '1/2',    -- &frac12;
    '3/4',    -- &frac34;
    '¿',      -- &iquest;
    'À',      -- &Agrave;
    'Á',      -- &Aacute;
    'Â',      -- &Acirc;
    'A',      -- &Atilde;
    '*',      -- &Auml;
    '*',      -- &Aring;
    'AE',     -- &AElig;
    'Ç',      -- &Ccedil;
    'È',      -- &Egrave;
    'É',      -- &Eacute;
    '*',      -- &Ecirc;
    '*',      -- &Euml;
    'Ì',      -- &Igrave;
    'Í',      -- &Iacute;
    'Î',      -- &Icirc;
    '*',      -- &Iuml;
    '*',      -- &ETH;
    'N',      -- &Ntilde;
    'Ò',      -- &Ograve;
    'Ó',      -- &Oacute;
    'Ô',      -- &Ocirc;
    'O',      -- &Otilde;
    '*',      -- &Ouml;
    '*',      -- &times;
    'O',      -- &Oslash;
    'Ù',      -- &Ugrave;
    'Ú',      -- &Uacute;
    'Û',      -- &Ucirc;
    '*',      -- &Uuml;
    '*',      -- &Yacute;
    '*',      -- &THORN;
    '*',      -- &szlig;
    'à',      -- &agrave;
    'á',      -- &aacute;
    'â',      -- &acirc;
    'a',      -- &atilde;
    '*',      -- &auml;
    'è',      -- &egrave;
    'é',      -- &eacute;
    'ê',      -- &ecirc;
    'e',      -- &etilde;
    '*',      -- &euml;
    'ì',      -- &igrave;
    'í',      -- &iacute;
    'î',      -- &icirc;
    'i',      -- &itilde;
    '*',      -- &iuml;
    'ò',      -- &ograve;
    'ó',      -- &oacute;
    'ô',      -- &ocirc;
    'o',      -- &otilde;
    '*',      -- &ouml;
    'ù',      -- &ugrave;
    'ú',      -- &uacute;
    'û',      -- &ucirc;
    'u',      -- &utilde;
    '*');     -- &uuml;

  cleaned := dirty;

  -- Drop whole blocks whose *content* is not document text.
  -- .*? is the lazy star (shortest possible match); flag 'n' lets '.' match
  -- newlines and 'i' makes the match case-insensitive. [^>]* lets the
  -- opening tag carry attributes (e.g. <style type="text/css">).
  cleaned := regexp_replace(cleaned, '<xml[^>]*>.*?</xml>', '', 1, 0, 'ni');
  cleaned := regexp_replace(cleaned, '<style[^>]*>.*?</style>', '', 1, 0, 'ni');
  -- Comments go in every mode: a comment may contain '>', which the generic
  -- tag pattern used in strip mode would otherwise cut in half.
  cleaned := regexp_replace(cleaned, '<!--.*?-->', '', 1, 0, 'ni');

  if to_cvs = 2 then
     -- sanitize (not strip) the html

     -- clean meta
     cleaned := regexp_replace(cleaned, '<meta.*?>', '', 1, 0, 'ni');
     -- clean link
     cleaned := regexp_replace(cleaned, '<link.*?>', '', 1, 0, 'ni');
     -- clean DIV
     cleaned := regexp_replace(cleaned, '</?div.*?>', '', 1, 0, 'ni');
     -- clean SPAN
     cleaned := regexp_replace(cleaned, '</?span.*?>', '', 1, 0, 'ni');

     -- Remove the class attribute (and the whitespace in front of it) from
     -- inside a tag. [^<>]* keeps the match within a single tag; the value
     -- may be double-quoted, single-quoted or unquoted.
     cleaned := regexp_replace(
                  cleaned,
                  '(<[^<>]*)[[:space:]]class=("[^"]*"|''[^'']*''|[^[:space:]>"'']*)([^<>]*>)',
                  '\1\3', 1, 0, 'ni');

     -- Remove namespace prefixes: <o:p> -> <p>, </o:p> -> </p>.
     -- The hyphen is last in the bracket expression so it is a literal.
     cleaned := regexp_replace(cleaned, '(</?)[a-zA-Z0-9_-]*:(.*?>)', '\1\2', 1, 0, 'ni');

     -- Remove empty element pairs. Two sweeps are made so that nested
     -- empties such as <strong><u></u></strong> disappear as well.
     -- TWEAK: an empty paragraph <p></p> is turned into <br/> instead.
     for sweep in 1 .. 2 loop
        cleaned := regexp_replace(cleaned, '<p></p>', '<br/>', 1, 0, 'ni');
        cleaned := regexp_replace(cleaned, '<([a-zA-Z0-9_-]*)></\1>', '', 1, 0, 'ni');
     end loop;

  else
     -- strip the html down to plain text

     -- Tags that end a line of text become a CR+LF line break.
     cleaned := regexp_replace(cleaned, '</p[^>]*>', crlf, 1, 0, 'i');
     -- <br>, <br/>, <br ...> and the non-standard </br>
     cleaned := regexp_replace(cleaned, '</?br[^>]*>', crlf, 1, 0, 'i');
     cleaned := regexp_replace(cleaned, '</tr[^>]*>', crlf, 1, 0, 'i');

     -- remove every remaining tag
     cleaned := regexp_replace(cleaned, '<[^>]*>', '', 1, 0, 'ni');

     -- Decode the named entities. The bound is taken from the table itself
     -- so that no entry is skipped. '&amp;' is held back until the end:
     -- decoding it first would turn '&amp;lt;' into '&lt;' and then into
     -- '<' (double decoding).
     for i in 1 .. entities_search_for.count loop
        if entities_search_for(i) <> '&amp;' then
           cleaned := replace(cleaned, entities_search_for(i), entities_replace(i));
        end if;
     end loop;
     cleaned := replace(cleaned, '&amp;', '&');

     -- Flatten for CSV export: no line breaks or tabs, and no characters
     -- that act as field separator (;) or text delimiter (").
     if to_cvs = 1 then
        cleaned := replace(cleaned, chr(10), '');
        cleaned := replace(cleaned, chr(13), '');
        cleaned := replace(cleaned, chr(9), '');
        cleaned := replace(cleaned, ';', ',');
        cleaned := replace(cleaned, '"', '''');
     end if;

  end if;

  return cleaned;
end strip_html;