User Tools

Site Tools


oracle:fixhtml

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
oracle:fixhtml [2011/03/19 20:37]
127.0.0.1 external edit
oracle:fixhtml [2014/12/24 10:42] (current)
Line 1: Line 1:
 +====== Fix or Sanitize HTML code from Microsoft Word ======
 +
 +Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word. 
 +
 +To the point. This snippet: ​
 +
 +<​code>​
 +select dirty, strip_html(dirty) from dual; 
 +</​code>​
 +
 +Removes all the HTML tags from the html code. But this one: 
 +
 +<​code>​
 +select dirty, strip_html(dirty,​2) from dual; 
 +</​code>​
 +
 +Wipes out all the garbage who is in the html code, leaving it --more or less-- "​clean"​. And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:
 +
 +<code plsql>
 +
 + ​create or replace function strip_html(dirty in clob,
 +                                      to_cvs in number default 0)
 +  return clob is out clob ;
 +  ​
 +  type arr_string is varray (200) of varchar2(64); ​
 +  ​
 +  entities_search_for arr_string; ​
 +  entities_replace arr_string;
 +  cont number; ​
 +  ​
 +begin
 +
 +
 +-- to accelerate the issue
 +if dirty is null then 
 +   ​return dirty; ​
 +end if; -- isnull(dirty)
 +
 +if length( dirty ) = 0 then 
 +   ​return dirty; ​
 +end if; -- length(dirty) ​
 +
 +entities_search_for := arr_string(
 +'&​excl;',​
 +'&​num;',​
 +'&​dollar;',​
 +'&​percnt;',​
 +'&​amp;',​
 +'&​quot;',​
 +'&​lpar;',​
 +'&​rpar;',​
 +'&​midast;',​
 +'&​plus;',​
 +'&​comma;',​
 +'&​hyphen;',​
 +'&​period;',​
 +'&​sol;',​
 +'&​colon;',​
 +'&​semi;',​
 +'&​lt;',​
 +'&​equals;',​
 +'&​gt;',​
 +'&​quest;',​
 +'&​commat;',​
 +'&​lsqb;',​
 +'&​bsol;',​
 +'&​rsqb;',​
 +'&​circ;',​
 +'&​lowbar;',​
 +'&​grave;',​
 +'&​lcub;',​
 +'&​verbar;',​
 +'&​rcub;',​
 +'&​tilde;',​
 +'&​nbsp;',​
 +'&​iexcl;',​
 +'&​cent;',​
 +'&​pound;',​
 +'&​curren;',​
 +'&​yen;',​
 +'&​brvbar;',​
 +'&​sect;',​
 +'&​Dot;',​
 +'&​copy;',​
 +'&​ordf;',​
 +'&​laquo;',​
 +'&​not;',​
 +'&​shy;',​
 +'&​reg;',​
 +'&​macr;',​
 +'&​deg;',​
 +'&​plusmn;',​
 +'&​sup2;',​
 +'&​sup3;',​
 +'&​acute;',​
 +'&​micro;',​
 +'&​para;',​
 +'&​middot;',​
 +'&​cedil;',​
 +'&​sup1;',​
 +'&​ordm;',​
 +'&​raquo;',​
 +'&​fr;',​
 +'&​fr;',​
 +'&​fr;',​
 +'&​iquest;',​
 +'&​Agrave;',​
 +'&​Aacute;',​
 +'&​Acirc;',​
 +'&​Atilde;',​
 +'&​Auml;',​
 +'&​Aring;',​
 +'&​AElig;',​
 +'&​il;',​
 +'&​Egrave;',​
 +'&​Eacute;',​
 +'&​Ecirc;',​
 +'&​Euml;',​
 +'&​Igrave;',​
 +'&​Iacute;',​
 +'&​Icirc;',​
 +'&​Iuml;',​
 +'&​ETH;',​
 +'&​Ntilde;',​
 +'&​Ograve;',​
 +'&​Oacute;',​
 +'&​Ocirc;',​
 +'&​Otilde;',​
 +'&​Ouml;',​
 +'&​times;',​
 +'&​Oslash;',​
 +'&​Ugrave;',​
 +'&​Uacute;',​
 +'&​Ucirc;',​
 +'&​Uuml;',​
 +'&​Yacute;',​
 +'&​THORN;',​
 +'&​szlig;',​
 +'&​agrave;',​
 +'&​aacute;',​
 +'&​acirc;',​
 +'&​atilde;',​
 +'&​auml;',​
 +'&​egrave;',​
 +'&​eacute;',​
 +'&​ecirc;',​
 +'&​etilde;',​
 +'&​euml;',​
 +'&​igrave;',​
 +'&​iacute;',​
 +'&​icirc;',​
 +'&​itilde;',​
 +'&​iuml;',​
 +'&​ograve;',​
 +'&​oacute;',​
 +'&​ocirc;',​
 +'&​otilde;',​
 +'&​ouml;',​
 +'&​ugrave;',​
 +'&​uacute;',​
 +'&​ucirc;',​
 +'&​utilde;',​
 +'&​uuml;'​);​
 +
 +entities_replace := arr_string(
 +'​¡',​
 +'​º',​
 +'​$',​
 +'​%',​
 +'&',​
 +'"',​
 +'​(',​
 +'​)',​
 +'​*',​
 +'​+',​
 +',',​
 +'​-',​
 +'​.',​
 +'​Sol',​
 +'​Colon',​
 +'​*',​
 +'<',​
 +'​=',​
 +'>',​
 +'?',​
 +',',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​_',​
 +'''',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'''',​
 +' ',
 +'​¡',​
 +'​cent',​
 +'​L',​
 +'​*',​
 +'​Y',​
 +'​*',​
 +'​*',​
 +'​.',​
 +'​(c)',​
 +'​*',​
 +'​*',​
 +'​!',​
 +'​*',​
 +'​(r)',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​á',​
 +'​u',​
 +'​*',​
 +'​·',​
 +'​ç',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​¿',​
 +'​È',​
 +'​Á',​
 +'​Ä',​
 +'​Á',​
 +'​*',​
 +'​*',​
 +'​AE',​
 +'​*',​
 +'​È',​
 +'​É',​
 +'​*',​
 +'​*',​
 +'​Ì',​
 +'​Í',​
 +'​Î',​
 +'​*',​
 +'​*',​
 +'​N',​
 +'​Ò',​
 +'​Ó',​
 +'​Ô',​
 +'​O',​
 +'​*',​
 +'​*',​
 +'​O',​
 +'​Ù',​
 +'​Ú',​
 +'​Û',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​*',​
 +'​à',​
 +'​á',​
 +'​â',​
 +'​a',​
 +'​*',​
 +'​è',​
 +'​é',​
 +'​ê',​
 +'​e',​
 +'​*',​
 +'​ì',​
 +'​í',​
 +'​î',​
 +'​i',​
 +'​*',​
 +'​ò',​
 +'​ó',​
 +'​ô',​
 +'​o',​
 +'​*',​
 +'​ù',​
 +'​ú',​
 +'​û',​
 +'​u',​
 +'​*'​); ​
 +
 +  out := dirty; ​
 +
 +  -- replace what is enclosed between <xml> and </​xml>​
 +  -- *? -> lazy star (catches the minimum possible)
 +  out := regexp_replace(out,​ '<​xml>​.*?</​xml>',​ '',​ 1, 0, '​ni'​ );
 +  -- clean what it is inside the style tags  ​
 +  out := regexp_replace(out,​ '<​style>​.*?</​style>',​ '',​ 1, 0, '​ni'​ );
 +
 +  if to_cvs = 2 then
 +     -- sanitize (not clean) the html
 +
 +     -- clean the tag <?​xml:​whatever>​
 +     out := regexp_replace(out,​ '<​\?​xml:​.*?>',​ '',​ 1, 0, '​ni'​);​
 +     -- clean the tags <img whatever>​
 +     out := regexp_replace(out,​ '<​img.*?>',​ '',​ 1, 0, '​ni'​);​
 +     -- clean comments
 +     out := regexp_replace(out,'<​!--.*?​-->','',​ 1, 0, '​ni'​);​
 +     -- clean meta
 +     out := regexp_replace(out,'<​meta.*?>','',​ 1, 0, '​ni'​);​
 +     -- clean link
 +     out := regexp_replace(out,'<​link.*?>','',​ 1, 0, '​ni'​);​
 +     -- clean DIV
 +     out := regexp_replace(out,'</?​div.*?>','',​ 1, 0, '​ni'​);​
 +     -- clean SPAN
 +     out := regexp_replace(out,'</?​span.*?>','',​ 1, 0, '​ni'​);​
 +     -- clean "class inside tags"
 +     out := regexp_replace(out,'​(<​.*?​)class="?​[a-zA-Z0-9-_]*"?​(.*?>​)',​ '​\1\2',​ 1, 0, '​ni'​);​
 +     -- clean "​style"​ inside the following tags: i b p
 +     out := regexp_replace(out,'​(<​[ibp] .*?​)style="​.*?"​(.*?>​)',​ '​\1\2',​ 1, 0, '​ni'​);​
 +     -- clean namespaces <o:p> </​o:​p>​
 +     out := regexp_replace(out,​ '​(<​)[a-zA-Z0-9-_]*:​(.*?>​)',​ '​\1\2',​ 1, 0, '​ni'​);​
 +     out := regexp_replace(out,​ '​(</​)[a-zA-Z0-9-_]*:​(.*?>​)',​ '​\1\2',​ 1, 0, '​ni'​);​
 +
 +     -- clean empty opening and closing tags: it has to be 
 +     -- passed twice or three times to clean things like this: 
 +     -- <​strong><​u></​u></​strong>​
 +     -- TWEAK: <​p></​p>​ must be replaced by <br/>
 +     out := regexp_replace(out,'<​p></​p>','<​br/>',​ 1, 0, '​ni'​);​
 +     out := regexp_replace(out,'<​([a-zA-Z0-9-_]*)></​\1>','',​ 1, 0, '​ni'​);​
 +     -- TWEAK: <​p></​p>​ must be replaced by <br/>
 +     out := regexp_replace(out,'<​p></​p>','<​br/>',​ 1, 0, '​ni'​);​
 +     out := regexp_replace(out,'<​([a-zA-Z0-9-_]*)></​\1>','',​ 1, 0, '​ni'​);​
 +
 +  else
 +    -- clean html
 +
 +    -- replace all the stuff that is similar to a carriage return
 +    out := regexp_replace(out,​ '</​p[^>​]*>',​chr(10)||chr(13));​
 +    out := regexp_replace(out,​ '</​br[^>​]*>',​chr(10)||chr(13));​
 +    out := regexp_replace(out,​ '</​tr[^>​]*>',​chr(10)||chr(13));​
 +    ​
 +    -- replace all the remaining html stuff 
 +    out := regexp_replace(out,'<​[^>​]*>','',​ 1, 0, '​ni'​);​
 +    ​
 +    -- replace all the entities
 +    for cont in 1..119 loop 
 +      out := replace( out, entities_search_for(cont),​ entities_replace(cont) );
 +    end loop; 
 +  ​
 +    -- cleaning for export to cvs
 +    if to_cvs = 1 then 
 +       out := replace( out, chr(10), ''​ ); 
 +       out := replace( out, chr(13), ''​ ); 
 +     out := replace( out, chr(9), ''​ ); 
 +       out := replace( out, ';',​ ','​ ); 
 +       out := replace( out, '"',​ ''''​ ); 
 +    end if; 
 +
 +
 +  end if; 
 +
 +  ​
 +  return(out);​
 +end strip_html;
 +
 +
 +
 +</​code>​
 +
 +
 +
 +
 +
  
oracle/fixhtml.txt · Last modified: 2014/12/24 10:42 (external edit)