oracle:fixhtml
Fix or Sanitize HTML
Yes: I've found the silver bullet for those of you who are seeking for a function that clean html code or sanitize it, specially if it comes from a cut and paste operation from word.
To the point. This snippet:
select dirty, strip_html(dirty) from dual;
Removes all the HTML tags from the html code. But this one:
select dirty, strip_html(dirty,2) from dual;
Wipes out all the garbage who is in the html code, leaving it –more or less– “clean”. And yes, I am using regular expresions to perform the fixing, so it is easy to move it to java or other programming languages. And without more preamble, here is the code:
CREATE OR REPLACE FUNCTION strip_html(dirty IN clob, to_cvs IN NUMBER DEFAULT 0) RETURN clob IS OUT clob ; TYPE arr_string IS varray (200) OF VARCHAR2(64); entities_search_for arr_string; entities_replace arr_string; cont NUMBER; BEGIN -- to accelerate the issue IF dirty IS NULL THEN RETURN dirty; END IF; -- isnull(dirty) IF LENGTH( dirty ) = 0 THEN RETURN dirty; END IF; -- length(dirty) entities_search_for := arr_string( '!', '#', '$', '%', '&', '"', '(', ')', '*', '+', ',', '‐', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\', ']', 'ˆ', '_', '`', '{', '|', '}', '˜', ' ', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '­', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '&fr;', '&fr;', '&fr;', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', '&il;', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'è', 'é', 'ê', '&etilde;', 'ë', 'ì', 'í', 'î', 'ĩ', 'ï', 'ò', 'ó', 'ô', 'õ', 'ö', 'ù', 'ú', 'û', 'ũ', 'ü'); entities_replace := arr_string( '¡', 'º', '$', '%', '&', '"', '(', ')', '*', '+', ',', '-', '.', 'Sol', 'Colon', '*', '<', '=', '>', '?', ',', '*', '*', '*', '*', '_', '''', '*', '*', '*', '''', ' ', '¡', 'cent', 'L', '*', 'Y', '*', '*', '.', '(c)', '*', '*', '!', '*', '(r)', '*', '*', '*', '*', '*', 'á', 'u', '*', '·', 'ç', '*', '*', '*', '*', '*', '*', '¿', 'È', 'Á', 'Ä', 'Á', '*', '*', 'AE', '*', 'È', 'É', '*', '*', 'Ì', 'Í', 'Î', '*', '*', 'N', 'Ò', 'Ó', 'Ô', 'O', '*', '*', 'O', 'Ù', 'Ú', 'Û', '*', '*', '*', '*', 'à', 'á', 'â', 'a', '*', 'è', 'é', 'ê', 'e', '*', 'ì', 'í', 'î', 'i', '*', 'ò', 'ó', 'ô', 'o', '*', 'ù', 'ú', 'û', 'u', '*'); OUT := dirty; -- replace what is enclosed between <xml> and </xml> -- *? -> lazy star (catches the minimum possible) OUT := regexp_replace(OUT, '<xml>.*?</xml>', '', 1, 0, 'ni' ); -- clean what it is inside the style tags OUT := regexp_replace(OUT, '<style>.*?</style>', '', 1, 0, 'ni' ); IF to_cvs = 2 THEN -- sanitize (not clean) the html -- clean the tag <?xml:whatever> OUT := regexp_replace(OUT, '<\?xml:.*?>', '', 1, 0, 'ni'); -- clean the tags <img whatever> OUT := regexp_replace(OUT, '<img.*?>', '', 1, 0, 'ni'); -- clean comments OUT := regexp_replace(OUT,'<!--.*?-->','', 1, 0, 'ni'); -- clean meta OUT := regexp_replace(OUT,'<meta.*?>','', 1, 0, 'ni'); -- clean link OUT := regexp_replace(OUT,'<link.*?>','', 1, 0, 'ni'); -- clean DIV OUT := regexp_replace(OUT,'</?div.*?>','', 1, 0, 'ni'); -- clean SPAN OUT := regexp_replace(OUT,'</?span.*?>','', 1, 0, 'ni'); -- clean "class inside tags" OUT := regexp_replace(OUT,'(<.*?)class="?[a-zA-Z0-9-_]*"?(.*?>)', '\1\2', 1, 0, 'ni'); -- clean "style" inside the following tags: i b p OUT := regexp_replace(OUT,'(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni'); -- clean namespaces <o:p> </o:p> OUT := regexp_replace(OUT, '(<)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni'); OUT := regexp_replace(OUT, '(</)[a-zA-Z0-9-_]*:(.*?>)', '\1\2', 1, 0, 'ni'); -- clean empty opening and closing tags: it has to be -- passed twice or three times to clean things like this: -- <strong><u></u></strong> -- TWEAK: <p></p> must be replaced by <br/> OUT := regexp_replace(OUT,'<p></p>','<br/>', 1, 0, 'ni'); OUT := regexp_replace(OUT,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni'); -- TWEAK: <p></p> must be replaced by <br/> OUT := regexp_replace(OUT,'<p></p>','<br/>', 1, 0, 'ni'); OUT := regexp_replace(OUT,'<([a-zA-Z0-9-_]*)></\1>','', 1, 0, 'ni'); ELSE -- clean html -- replace all the stuff that is similar to a carriage return OUT := regexp_replace(OUT, '</p[^>]*>',CHR(10)||CHR(13)); OUT := regexp_replace(OUT, '</br[^>]*>',CHR(10)||CHR(13)); OUT := regexp_replace(OUT, '</tr[^>]*>',CHR(10)||CHR(13)); -- replace all the remaining html stuff OUT := regexp_replace(OUT,'<[^>]*>','', 1, 0, 'ni'); -- replace all the entities FOR cont IN 1..119 LOOP OUT := REPLACE( OUT, entities_search_for(cont), entities_replace(cont) ); END LOOP; -- cleaning for export to cvs IF to_cvs = 1 THEN OUT := REPLACE( OUT, CHR(10), '' ); OUT := REPLACE( OUT, CHR(13), '' ); OUT := REPLACE( OUT, CHR(9), '' ); OUT := REPLACE( OUT, ';', ',' ); OUT := REPLACE( OUT, '"', '''' ); END IF; END IF; RETURN(OUT); END strip_html;
oracle/fixhtml.txt · Last modified: 2024/10/05 17:05 by rlunaro