====== Fix or Sanitize HTML code from Microsoft Word ====== Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word. To the point. This snippet: select dirty, strip_html(dirty) from dual; Removes all the HTML tags from the html code. But this one: select dirty, strip_html(dirty,2) from dual; Wipes out all the garbage who is in the html code, leaving it --more or less-- "clean". And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code: create or replace function strip_html(dirty in clob, to_cvs in number default 0) return clob is out clob ; type arr_string is varray (200) of varchar2(64); entities_search_for arr_string; entities_replace arr_string; cont number; begin -- to accelerate the issue if dirty is null then return dirty; end if; -- isnull(dirty) if length( dirty ) = 0 then return dirty; end if; -- length(dirty) entities_search_for := arr_string( '!', '#', '$', '%', '&', '"', '(', ')', '*', '+', ',', '‐', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\', ']', 'ˆ', '_', '`', '{', '|', '}', '˜', ' ', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '­', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '&fr;', '&fr;', '&fr;', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', '&il;', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'è', 'é', 'ê', '&etilde;', 'ë', 'ì', 'í', 'î', 'ĩ', 'ï', 'ò', 'ó', 'ô', 'õ', 'ö', 'ù', 'ú', 'û', 'ũ', 'ü'); entities_replace := arr_string( '¡', 'º', '$', '%', '&', '"', '(', ')', '*', '+', ',', '-', '.', 'Sol', 'Colon', '*', '<', '=', '>', '?', ',', '*', '*', '*', '*', '_', '''', '*', '*', '*', '''', ' ', '¡', 'cent', 'L', '*', 'Y', '*', '*', '.', '(c)', '*', '*', '!', '*', '(r)', '*', '*', '*', '*', '*', 'á', 'u', '*', '·', 'ç', '*', '*', '*', '*', '*', '*', '¿', 'È', 'Á', 'Ä', 'Á', '*', '*', 'AE', '*', 'È', 'É', '*', '*', 'Ì', 'Í', 'Î', '*', '*', 'N', 'Ò', 'Ó', 'Ô', 'O', '*', '*', 'O', 'Ù', 'Ú', 'Û', '*', '*', '*', '*', 'à', 'á', 'â', 'a', '*', 'è', 'é', 'ê', 'e', '*', 'ì', 'í', 'î', 'i', '*', 'ò', 'ó', 'ô', 'o', '*', 'ù', 'ú', 'û', 'u', '*'); out := dirty; -- replace what is enclosed between and -- *? -> lazy star (catches the minimum possible) out := regexp_replace(out, '.*?', '', 1, 0, 'ni' ); -- clean what it is inside the style tags out := regexp_replace(out, '', '', 1, 0, 'ni' ); if to_cvs = 2 then -- sanitize (not clean) the html -- clean the tag out := regexp_replace(out, '<\?xml:.*?>', '', 1, 0, 'ni'); -- clean the tags out := regexp_replace(out, '', '', 1, 0, 'ni'); -- clean comments out := regexp_replace(out,'','', 1, 0, 'ni'); -- clean meta out := regexp_replace(out,'','', 1, 0, 'ni'); -- clean link out := regexp_replace(out,'','', 1, 0, 'ni'); -- clean DIV out := regexp_replace(out,'','', 1, 0, 'ni'); -- clean SPAN out := regexp_replace(out,'','', 1, 0, 'ni'); -- clean "class inside tags" out := regexp_replace(out,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni'); -- clean "style" inside the following tags: i b p out := regexp_replace(out,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni'); -- clean namespaces out := regexp_replace(out, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni'); out := regexp_replace(out, '()', '\1\2', 1, 0, 'ni'); -- clean empty opening and closing tags: it has to be -- passed twice or three times to clean things like this: -- -- TWEAK:

must be replaced by
out := regexp_replace(out,'

','
', 1, 0, 'ni'); out := regexp_replace(out,'<([a-zA-Z0-9-_]*)>','', 1, 0, 'ni'); -- TWEAK:

must be replaced by
out := regexp_replace(out,'

','
', 1, 0, 'ni'); out := regexp_replace(out,'<([a-zA-Z0-9-_]*)>','', 1, 0, 'ni'); else -- clean html -- replace all the stuff that is similar to a carriage return out := regexp_replace(out, ']*>',chr(10)||chr(13)); out := regexp_replace(out, ']*>',chr(10)||chr(13)); out := regexp_replace(out, ']*>',chr(10)||chr(13)); -- replace all the remaining html stuff out := regexp_replace(out,'<[^>]*>','', 1, 0, 'ni'); -- replace all the entities for cont in 1..119 loop out := replace( out, entities_search_for(cont), entities_replace(cont) ); end loop; -- cleaning for export to cvs if to_cvs = 1 then out := replace( out, chr(10), '' ); out := replace( out, chr(13), '' ); out := replace( out, chr(9), '' ); out := replace( out, ';', ',' ); out := replace( out, '"', '''' ); end if; end if; return(out); end strip_html;