Fix or Sanitize HTML

Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word.

To the point. This snippet:

select dirty, strip_html(dirty) from dual; 

Removes all the HTML tags from the html code. But this one:

select dirty, strip_html(dirty,2) from dual; 

Wipes out all the garbage who is in the html code, leaving it –more or less– “clean”. And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:

 CREATE OR REPLACE FUNCTION strip_html(dirty IN clob,
                                      to_cvs IN NUMBER DEFAULT 0)
  RETURN clob IS OUT clob ;
 
  TYPE arr_string IS varray (200) OF VARCHAR2(64); 
 
  entities_search_for arr_string; 
  entities_replace arr_string;
  cont NUMBER; 
 
BEGIN
 
 
-- to accelerate the issue
IF dirty IS NULL THEN 
   RETURN dirty; 
END IF; -- isnull(dirty)
 
IF LENGTH( dirty ) = 0 THEN 
   RETURN dirty; 
END IF; -- length(dirty) 
 
entities_search_for := arr_string(
'!',
'#',
'$',
'%',
'&',
'"',
'(',
')',
'*',
'+',
',',
'‐',
'.',
'/',
':',
';',
'<',
'=',
'>',
'?',
'@',
'[',
'\',
']',
'ˆ',
'_',
'`',
'{',
'|',
'}',
'˜',
' ',
'¡',
'¢',
'£',
'¤',
'¥',
'¦',
'§',
'¨',
'©',
'ª',
'«',
'¬',
'­',
'®',
'¯',
'°',
'±',
'²',
'³',
'´',
'µ',
'¶',
'·',
'¸',
'¹',
'º',
'»',
'&fr;',
'&fr;',
'&fr;',
'¿',
'À',
'Á',
'Â',
'Ã',
'Ä',
'Å',
'Æ',
'&il;',
'È',
'É',
'Ê',
'Ë',
'Ì',
'Í',
'Î',
'Ï',
'Ð',
'Ñ',
'Ò',
'Ó',
'Ô',
'Õ',
'Ö',
'×',
'Ø',
'Ù',
'Ú',
'Û',
'Ü',
'Ý',
'Þ',
'ß',
'à',
'á',
'â',
'ã',
'ä',
'è',
'é',
'ê',
'&etilde;',
'ë',
'ì',
'í',
'î',
'ĩ',
'ï',
'ò',
'ó',
'ô',
'õ',
'ö',
'ù',
'ú',
'û',
'ũ',
'ü');
 
entities_replace := arr_string(
'¡',
'º',
'$',
'%',
'&',
'"',
'(',
')',
'*',
'+',
',',
'-',
'.',
'Sol',
'Colon',
'*',
'<',
'=',
'>',
'?',
',',
'*',
'*',
'*',
'*',
'_',
'''',
'*',
'*',
'*',
'''',
' ',
'¡',
'cent',
'L',
'*',
'Y',
'*',
'*',
'.',
'(c)',
'*',
'*',
'!',
'*',
'(r)',
'*',
'*',
'*',
'*',
'*',
'á',
'u',
'*',
'·',
'ç',
'*',
'*',
'*',
'*',
'*',
'*',
'¿',
'È',
'Á',
'Ä',
'Á',
'*',
'*',
'AE',
'*',
'È',
'É',
'*',
'*',
'Ì',
'Í',
'Î',
'*',
'*',
'N',
'Ò',
'Ó',
'Ô',
'O',
'*',
'*',
'O',
'Ù',
'Ú',
'Û',
'*',
'*',
'*',
'*',
'à',
'á',
'â',
'a',
'*',
'è',
'é',
'ê',
'e',
'*',
'ì',
'í',
'î',
'i',
'*',
'ò',
'ó',
'ô',
'o',
'*',
'ù',
'ú',
'û',
'u',
'*'); 
 
  OUT := dirty; 
 
  -- replace what is enclosed between <xml> and </xml>
  -- *? -> lazy star (catches the minimum possible)
  OUT := regexp_replace(OUT, '<xml>.*?</xml>', '', 1, 0, 'ni' );
  -- clean what it is inside the style tags  
  OUT := regexp_replace(OUT, '<style>.*?</style>', '', 1, 0, 'ni' );
 
  IF to_cvs = 2 THEN
     -- sanitize (not clean) the html
 
     -- clean the tag <?xml:whatever>
     OUT := regexp_replace(OUT, '<\?xml:.*?>', '', 1, 0, 'ni');
     -- clean the tags <img whatever>
     OUT := regexp_replace(OUT, '<img.*?>', '', 1, 0, 'ni');
     -- clean comments
     OUT := regexp_replace(OUT,'<!--.*?-->','', 1, 0, 'ni');
     -- clean meta
     OUT := regexp_replace(OUT,'<meta.*?>','', 1, 0, 'ni');
     -- clean link
     OUT := regexp_replace(OUT,'<link.*?>','', 1, 0, 'ni');
     -- clean DIV
     OUT := regexp_replace(OUT,'</?div.*?>','', 1, 0, 'ni');
     -- clean SPAN
     OUT := regexp_replace(OUT,'</?span.*?>','', 1, 0, 'ni');
     -- clean "class inside tags"
     OUT := regexp_replace(OUT,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni');
     -- clean "style" inside the following tags: i b p
     OUT := regexp_replace(OUT,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
     -- clean namespaces <o:p> </o:p>
     OUT := regexp_replace(OUT, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
     OUT := regexp_replace(OUT, '(</)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni');
 
     -- clean empty opening and closing tags: it has to be 
     -- passed twice or three times to clean things like this: 
     -- <strong><u></u></strong>
     -- TWEAK: <p></p> must be replaced by <br/>
     OUT := regexp_replace(OUT,'<p></p>','<br/>', 1, 0, 'ni');
     OUT := regexp_replace(OUT,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
     -- TWEAK: <p></p> must be replaced by <br/>
     OUT := regexp_replace(OUT,'<p></p>','<br/>', 1, 0, 'ni');
     OUT := regexp_replace(OUT,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni');
 
  ELSE
    -- clean html
 
    -- replace all the stuff that is similar to a carriage return
    OUT := regexp_replace(OUT, '</p[^>]*>',CHR(10)||CHR(13));
    OUT := regexp_replace(OUT, '</br[^>]*>',CHR(10)||CHR(13));
    OUT := regexp_replace(OUT, '</tr[^>]*>',CHR(10)||CHR(13));
 
    -- replace all the remaining html stuff 
    OUT := regexp_replace(OUT,'<[^>]*>','', 1, 0, 'ni');
 
    -- replace all the entities
    FOR cont IN 1..119 LOOP 
      OUT := REPLACE( OUT, entities_search_for(cont), entities_replace(cont) );
    END LOOP; 
 
    -- cleaning for export to cvs
    IF to_cvs = 1 THEN 
       OUT := REPLACE( OUT, CHR(10), '' ); 
       OUT := REPLACE( OUT, CHR(13), '' ); 
  	   OUT := REPLACE( OUT, CHR(9), '' ); 
       OUT := REPLACE( OUT, ';', ',' ); 
       OUT := REPLACE( OUT, '"', '''' ); 
    END IF; 
 
 
  END IF; 
 
 
  RETURN(OUT);
END strip_html;