oracle:fixhtml
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionLast revisionBoth sides next revision | ||
oracle:fixhtml [2010/11/03 13:32] – rlunaro | oracle:fixhtml [2013/04/05 18:30] – rlunaro | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== Fix or Sanitize HTML code from Word ====== | ||
- | |||
- | Yes: I've found the silver bullet for those of you who are looking for a function that cleans or sanitizes HTML code, especially when it comes from a cut-and-paste operation from Word. | ||
- | |||
- | To the point. This snippet: | ||
- | |||
- | < | ||
- | select dirty, strip_html(dirty) from dual; | ||
- | </ | ||
- | |||
- | Removes all the HTML tags from the html code. But this one: | ||
- | |||
- | < | ||
- | select dirty, strip_html(dirty, | ||
- | </ | ||
- | |||
- | Wipes out all the garbage that is in the HTML code, leaving it --more or less-- " | ||
- | |||
- | <code plsql> | ||
- | |||
- | | ||
- | to_cvs in number default 0) | ||
- | return clob is out clob ; | ||
- | | ||
- | type arr_string is varray (200) of varchar2(64); | ||
- | | ||
- | entities_search_for arr_string; | ||
- | entities_replace arr_string; | ||
- | cont number; | ||
- | | ||
- | begin | ||
- | |||
- | |||
- | -- to accelerate the issue | ||
- | if dirty is null then | ||
- | | ||
- | end if; -- isnull(dirty) | ||
- | |||
- | if length( dirty ) = 0 then | ||
- | | ||
- | end if; -- length(dirty) | ||
- | |||
- | entities_search_for := arr_string( | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | '& | ||
- | |||
- | entities_replace := arr_string( | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | '&', | ||
- | '"', | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ',', | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | '<', | ||
- | ' | ||
- | '>', | ||
- | '?', | ||
- | ',', | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | '''', | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | '''', | ||
- | ' ', | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | ' | ||
- | |||
- | out := dirty; | ||
- | |||
- | -- replace what is enclosed between <xml> and </ | ||
- | -- *? -> lazy star (catches the minimum possible) | ||
- | out := regexp_replace(out, | ||
- | -- clean what it is inside the style tags | ||
- | out := regexp_replace(out, | ||
- | |||
- | if to_cvs = 2 then | ||
- | -- sanitize (not clean) the html | ||
- | |||
- | -- clean the tag <? | ||
- | out := regexp_replace(out, | ||
- | -- clean the tags <img whatever> | ||
- | out := regexp_replace(out, | ||
- | -- clean comments | ||
- | out := regexp_replace(out,'< | ||
- | -- clean meta | ||
- | out := regexp_replace(out,'< | ||
- | -- clean link | ||
- | out := regexp_replace(out,'< | ||
- | -- clean DIV | ||
- | out := regexp_replace(out,'</? | ||
- | -- clean SPAN | ||
- | out := regexp_replace(out,'</? | ||
- | -- clean "class inside tags" | ||
- | out := regexp_replace(out,' | ||
- | -- clean " | ||
- | out := regexp_replace(out,' | ||
- | -- clean namespaces <o:p> </ | ||
- | out := regexp_replace(out, | ||
- | out := regexp_replace(out, | ||
- | |||
- | -- clean empty opening and closing tags: it has to be | ||
- | -- passed twice or three times to clean things like this: | ||
- | -- < | ||
- | -- TWEAK: < | ||
- | out := regexp_replace(out,'< | ||
- | out := regexp_replace(out,'< | ||
- | -- TWEAK: < | ||
- | out := regexp_replace(out,'< | ||
- | out := regexp_replace(out,'< | ||
- | |||
- | else | ||
- | -- clean html | ||
- | |||
- | -- replace all the stuff that is similar to a carriage return | ||
- | out := regexp_replace(out, | ||
- | out := regexp_replace(out, | ||
- | out := regexp_replace(out, | ||
- | | ||
- | -- replace all the remaining html stuff | ||
- | out := regexp_replace(out,'< | ||
- | | ||
- | -- replace all the entities | ||
- | for cont in 1..119 loop | ||
- | out := replace( out, entities_search_for(cont), | ||
- | end loop; | ||
- | | ||
- | -- cleaning for export to cvs | ||
- | if to_cvs = 1 then | ||
- | out := replace( out, chr(10), '' | ||
- | out := replace( out, chr(13), '' | ||
- | out := replace( out, chr(9), '' | ||
- | out := replace( out, ';', | ||
- | out := replace( out, '"', | ||
- | end if; | ||
- | |||
- | |||
- | end if; | ||
- | |||
- | | ||
- | return(out); | ||
- | end strip_html; | ||
- | |||
- | |||
- | |||
- | </ | ||
- | |||
- | |||
- | |||
- | |||
- | |||
oracle/fixhtml.txt · Last modified: 2022/12/02 22:02 by 127.0.0.1