WITH FUNCTION clean_col(p_text CLOB) RETURN CLOB IS
v_text CLOB := p_text;
BEGIN
IF v_text IS NULL THEN
RETURN NULL;
END IF;
v_text := REGEXP_REPLACE(v_text, '<!DOCTYPE[^>]*>', '', 1, 0, 'i');
-- Remove <o:p> tags
v_text := REGEXP_REPLACE(v_text, '<o:p>.*?</o:p>', '', 1, 0, 'in');
-- Remove HTML comments
v_text := REGEXP_REPLACE(v_text, '<!--.*?-->', '', 1, 0, 'in');
-- Remove style blocks
v_text := REGEXP_REPLACE(v_text, '<style[^>]*>.*?</style>', '', 1, 0, 'in');
-- Decode HTML entities
v_text := REPLACE(REPLACE(REPLACE(REPLACE(v_text,
'"', '"'),
'>', '>'),
'<', '<'),
'&', '&');
v_text := REGEXP_REPLACE(v_text, ' | ', ' ', 1, 0, 'i');
-- Convert block-level tags to newlines
v_text := REGEXP_REPLACE(v_text,
'(<p[^>]*>|</p>|<br\s*/?>|<div[^>]*>|</div>|<h[1-6][^>]*>|</h[1-6]>|<hr[^>]*>|<ul[^>]*>|</ul>|<ol[^>]*>|</ol>)',
CHR(10), 1, 0, 'in');
-- Convert <tr> to newline
v_text := REGEXP_REPLACE(v_text, '</tr\s*>', CHR(10), 1, 0, 'in');
-- Convert <td> and <th> to tab
v_text := REGEXP_REPLACE(v_text, '<(td|th)[^>]*>', CHR(9), 1, 0, 'in');
-- Convert <li> to "- text" on the same line
v_text := REGEXP_REPLACE(
v_text,
'<li[^>]*>\s*(.*?)\s*</li>',
'• \1' || CHR(10),
1, 0, 'in'
);
v_text := REGEXP_REPLACE(
v_text,
'<ol[^>]*>(.*?)</ol>',
'• \1' || CHR(10),
1, 0, 'in'
);
-- Strip remaining HTML tags
v_text := REGEXP_REPLACE(v_text, '</?[A-Za-z][A-Za-z0-9]*(\s+[^>]*)?>', '', 1, 0, 'in');
-- Collapse multiple newlines
-- Trim leading and trailing spaces/newlines
v_text := REGEXP_REPLACE(v_text, '^[[:space:]\r\n]+', '');
v_text := REGEXP_REPLACE(v_text, '[[:space:]\r\n]+$', '');
RETURN v_text;
END;
SELECT
clean_col(clean_col(clob_data)) AS cleaned_text
FROM table