User Tools

Site Tools


oracle:fixhtml

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
oracle:fixhtml [2011/03/19 20:37] – external edit 127.0.0.1oracle:fixhtml [2022/12/02 22:02] (current) – external edit 127.0.0.1
Line 1: Line 1:
 +====== Fix or Sanitize HTML code from Microsoft Word ======
 +
 +Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word. 
 +
 +To the point. This snippet: 
 +
 +<code>
 +select dirty, strip_html(dirty) from dual; 
 +</code>
 +
 +Removes all the HTML tags from the html code. But this one: 
 +
 +<code>
 +select dirty, strip_html(dirty,2) from dual; 
 +</code>
 +
 +Wipes out all the garbage who is in the html code, leaving it --more or less-- "clean". And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:
 +
 +<code plsql>
 +
 + create or replace function strip_html(dirty in clob,
 +                                      to_cvs in number default 0)
 +  return clob is out clob ;
 +  
 +  type arr_string is varray (200) of varchar2(64); 
 +  
 +  entities_search_for arr_string; 
 +  entities_replace arr_string;
 +  cont number; 
 +  
 +begin
 +
 +
 +-- to accelerate the issue
 +if dirty is null then 
 +   return dirty; 
 +end if; -- isnull(dirty)
 +
 +if length( dirty ) = 0 then 
 +   return dirty; 
 +end if; -- length(dirty) 
 +
 +entities_search_for := arr_string(
 +'&excl;',
 +'&num;',
 +'&dollar;',
 +'&percnt;',
 +'&amp;',
 +'&quot;',
 +'&lpar;',
 +'&rpar;',
 +'&midast;',
 +'&plus;',
 +'&comma;',
 +'&hyphen;',
 +'&period;',
 +'&sol;',
 +'&colon;',
 +'&semi;',
 +'&lt;',
 +'&equals;',
 +'&gt;',
 +'&quest;',
 +'&commat;',
 +'&lsqb;',
 +'&bsol;',
 +'&rsqb;',
 +'&circ;',
 +'&lowbar;',
 +'&grave;',
 +'&lcub;',
 +'&verbar;',
 +'&rcub;',
 +'&tilde;',
 +'&nbsp;',
 +'&iexcl;',
 +'&cent;',
 +'&pound;',
 +'&curren;',
 +'&yen;',
 +'&brvbar;',
 +'&sect;',
 +'&Dot;',
 +'&copy;',
 +'&ordf;',
 +'&laquo;',
 +'&not;',
 +'&shy;',
 +'&reg;',
 +'&macr;',
 +'&deg;',
 +'&plusmn;',
 +'&sup2;',
 +'&sup3;',
 +'&acute;',
 +'&micro;',
 +'&para;',
 +'&middot;',
 +'&cedil;',
 +'&sup1;',
 +'&ordm;',
 +'&raquo;',
 +'&fr;',
 +'&fr;',
 +'&fr;',
 +'&iquest;',
 +'&Agrave;',
 +'&Aacute;',
 +'&Acirc;',
 +'&Atilde;',
 +'&Auml;',
 +'&Aring;',
 +'&AElig;',
 +'&il;',
 +'&Egrave;',
 +'&Eacute;',
 +'&Ecirc;',
 +'&Euml;',
 +'&Igrave;',
 +'&Iacute;',
 +'&Icirc;',
 +'&Iuml;',
 +'&ETH;',
 +'&Ntilde;',
 +'&Ograve;',
 +'&Oacute;',
 +'&Ocirc;',
 +'&Otilde;',
 +'&Ouml;',
 +'&times;',
 +'&Oslash;',
 +'&Ugrave;',
 +'&Uacute;',
 +'&Ucirc;',
 +'&Uuml;',
 +'&Yacute;',
 +'&THORN;',
 +'&szlig;',
 +'&agrave;',
 +'&aacute;',
 +'&acirc;',
 +'&atilde;',
 +'&auml;',
 +'&egrave;',
 +'&eacute;',
 +'&ecirc;',
 +'&etilde;',
 +'&euml;',
 +'&igrave;',
 +'&iacute;',
 +'&icirc;',
 +'&itilde;',
 +'&iuml;',
 +'&ograve;',
 +'&oacute;',
 +'&ocirc;',
 +'&otilde;',
 +'&ouml;',
 +'&ugrave;',
 +'&uacute;',
 +'&ucirc;',
 +'&utilde;',
 +'&uuml;');
 +
 +entities_replace := arr_string(
 +'¡',
 +'º',
 +'$',
 +'%',
 +'&',
 +'"',
 +'(',
 +')',
 +'*',
 +'+',
 +',',
 +'-',
 +'.',
 +'Sol',
 +'Colon',
 +'*',
 +'<',
 +'=',
 +'>',
 +'?',
 +',',
 +'*',
 +'*',
 +'*',
 +'*',
 +'_',
 +'''',
 +'*',
 +'*',
 +'*',
 +'''',
 +' ',
 +'¡',
 +'cent',
 +'L',
 +'*',
 +'Y',
 +'*',
 +'*',
 +'.',
 +'(c)',
 +'*',
 +'*',
 +'!',
 +'*',
 +'(r)',
 +'*',
 +'*',
 +'*',
 +'*',
 +'*',
 +'á',
 +'u',
 +'*',
 +'·',
 +'ç',
 +'*',
 +'*',
 +'*',
 +'*',
 +'*',
 +'*',
 +'¿',
 +'È',
 +'Á',
 +'Ä',
 +'Á',
 +'*',
 +'*',
 +'AE',
 +'*',
 +'È',
 +'É',
 +'*',
 +'*',
 +'Ì',
 +'Í',
 +'Î',
 +'*',
 +'*',
 +'N',
 +'Ò',
 +'Ó',
 +'Ô',
 +'O',
 +'*',
 +'*',
 +'O',
 +'Ù',
 +'Ú',
 +'Û',
 +'*',
 +'*',
 +'*',
 +'*',
 +'à',
 +'á',
 +'â',
 +'a',
 +'*',
 +'è',
 +'é',
 +'ê',
 +'e',
 +'*',
 +'ì',
 +'í',
 +'î',
 +'i',
 +'*',
 +'ò',
 +'ó',
 +'ô',
 +'o',
 +'*',
 +'ù',
 +'ú',
 +'û',
 +'u',
 +'*'); 
 +
 +  out := dirty; 
 +
 +  -- replace what is enclosed between <xml> and </xml>
 +  -- *? -> lazy star (catches the minimum possible)
 +  out := regexp_replace(out, '<xml>.*?</xml>', '', 1, 0, 'ni' );
 +  -- clean what it is inside the style tags  
 +  out := regexp_replace(out, '<style>.*?</style>', '', 1, 0, 'ni' );
 +
 +  if to_cvs = 2 then
 +     -- sanitize (not clean) the html
 +
 +     -- clean the tag <?xml:whatever>
 +     out := regexp_replace(out, '<\?xml:.*?>', '', 1, 0, 'ni');
 +     -- clean the tags <img whatever>
 +     out := regexp_replace(out, '<img.*?>', '', 1, 0, 'ni');
 +     -- clean comments
 +     out := regexp_replace(out,'<!--.*?-->','', 1, 0, 'ni');
 +     -- clean meta
 +     out := regexp_replace(out,'<meta.*?>','', 1, 0, 'ni');
 +     -- clean link
 +     out := regexp_replace(out,'<link.*?>','', 1, 0, 'ni');
 +     -- clean DIV
 +     out := regexp_replace(out,'</?div.*?>','', 1, 0, 'ni');
 +     -- clean SPAN
 +     out := regexp_replace(out,'</?span.*?>','', 1, 0, 'ni');
 +     -- clean "class inside tags"
 +     out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');
 +     -- clean "style" inside the following tags: i b p
 +     out := regexp_replace(out,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
 +     -- clean namespaces <o:p> </o:p>
 +     out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
 +     out := regexp_replace(out, '(</)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
 +
 +     -- clean empty opening and closing tags: it has to be 
 +     -- passed twice or three times to clean things like this: 
 +     -- <strong><u></u></strong>
 +     -- TWEAK: <p></p> must be replaced by <br/>
 +     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
 +     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
 +     -- TWEAK: <p></p> must be replaced by <br/>
 +     out := regexp_replace(out,'<p></p>','<br/>', 1, 0, 'ni');
 +     out := regexp_replace(out,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
 +
 +  else
 +    -- clean html
 +
 +    -- replace all the stuff that is similar to a carriage return
 +    out := regexp_replace(out, '</p[^>]*>',chr(10)||chr(13));
 +    out := regexp_replace(out, '</br[^>]*>',chr(10)||chr(13));
 +    out := regexp_replace(out, '</tr[^>]*>',chr(10)||chr(13));
 +    
 +    -- replace all the remaining html stuff 
 +    out := regexp_replace(out,'<[^>]*>','', 1, 0, 'ni');
 +    
 +    -- replace all the entities
 +    for cont in 1..119 loop 
 +      out := replace( out, entities_search_for(cont), entities_replace(cont) );
 +    end loop; 
 +  
 +    -- cleaning for export to cvs
 +    if to_cvs = 1 then 
 +       out := replace( out, chr(10), '' ); 
 +       out := replace( out, chr(13), '' ); 
 +     out := replace( out, chr(9), '' ); 
 +       out := replace( out, ';', ',' ); 
 +       out := replace( out, '"', '''' ); 
 +    end if; 
 +
 +
 +  end if; 
 +
 +  
 +  return(out);
 +end strip_html;
 +
 +
 +
 +</code>
 +
 +
 +
 +
 +
  
oracle/fixhtml.txt · Last modified: 2022/12/02 22:02 by 127.0.0.1