Skip to Main Content

SQL & PL/SQL

Announcement

For appeals, questions and feedback about Oracle Forums, please email oracle-forums-moderators_us@oracle.com. Technical questions should be asked in the appropriate category. Thank you!

Solved - Regular Expressions for converting HTML to Structured Plain Text

WITH FUNCTION clean_col(p_text CLOB) RETURN CLOB IS
    v_text CLOB := p_text;
BEGIN
    IF v_text IS NULL THEN
        RETURN NULL;
    END IF;
    
    v_text := REGEXP_REPLACE(v_text, '<!DOCTYPE[^>]*>', '', 1, 0, 'i');


    -- Remove <o:p> tags
    v_text := REGEXP_REPLACE(v_text, '<o:p>.*?</o:p>', '', 1, 0, 'in');

    -- Remove HTML comments
    v_text := REGEXP_REPLACE(v_text, '<!--.*?-->', '', 1, 0, 'in');

    -- Remove style blocks
    v_text := REGEXP_REPLACE(v_text, '<style[^>]*>.*?</style>', '', 1, 0, 'in');

    -- Decode HTML entities
    v_text := REPLACE(REPLACE(REPLACE(REPLACE(v_text,
                '&quot;', '"'),
                '&gt;', '>'),
                '&lt;', '<'),
                '&amp;', '&');
    v_text := REGEXP_REPLACE(v_text, '&nbsp;|&#160;', ' ', 1, 0, 'i');

    -- Convert block-level tags to newlines
    v_text := REGEXP_REPLACE(v_text,
                '(<p[^>]*>|</p>|<br\s*/?>|<div[^>]*>|</div>|<h[1-6][^>]*>|</h[1-6]>|<hr[^>]*>|<ul[^>]*>|</ul>|<ol[^>]*>|</ol>)',
                CHR(10), 1, 0, 'in');

    -- Convert <tr> to newline
    v_text := REGEXP_REPLACE(v_text, '</tr\s*>', CHR(10), 1, 0, 'in');

    -- Convert <td> and <th> to tab
    v_text := REGEXP_REPLACE(v_text, '<(td|th)[^>]*>', CHR(9), 1, 0, 'in');

   
    -- Convert <li> to "- text" on the same line
    v_text := REGEXP_REPLACE(
             v_text,
             '<li[^>]*>\s*(.*?)\s*</li>',
             '• \1' || CHR(10),
             1, 0, 'in'
          );
          
    v_text := REGEXP_REPLACE(
             v_text,
             '<ol[^>]*>(.*?)</ol>',
             '• \1' || CHR(10),
             1, 0, 'in'
          );

    -- Strip remaining HTML tags
    v_text := REGEXP_REPLACE(v_text, '</?[A-Za-z][A-Za-z0-9]*(\s+[^>]*)?>', '', 1, 0, 'in');

    -- Collapse multiple newlines

    -- Trim leading and trailing spaces/newlines
    v_text := REGEXP_REPLACE(v_text, '^[[:space:]\r\n]+', '');
    v_text := REGEXP_REPLACE(v_text, '[[:space:]\r\n]+$', '');

    RETURN v_text;
END;

SELECT 
       clean_col(clean_col(clob_data)) AS cleaned_text
FROM table
Comments
Post Details
Added 6 days ago
1 comment
83 views