Wiki Conversion OddMuse To Confluence

I asked some questions about regular expressions on the Tclers Chat, and those who helped me with the answers suggested I post the solutions to a new page, which I did at Advanced Regular Expression Examples. They also thought I should post my entire code, so here it is, below. -- buchs


#!/usr/bin/tclsh

# Convert wiki content from the OddMuse (http://www.oddmuse.org) wiki syntax to
# Confluence (http://www.atlassian.com/software/confluence/) wiki syntax.
# The result is a series of .txt files that then need to be imported via Confluence.

# Kevin Buchs, Mayo Clinic, 2008

# Configuration, kept in the global array "gbl":
#   wikisource  glob pattern that selects the OddMuse page storage directories
#   work        directory that receives the converted output files
array set gbl {
    wikisource //sppdgfs/local/doc/oddmuse-sppdg/page/*
    work       /users/buchs/wiki_import
}

# convert_body --
#
#   Converts the text of one OddMuse page to Confluence wiki markup.
#
# Arguments:
#   thebody   Page text in OddMuse syntax.
#
# Results:
#   Returns the converted text.  The conversion is a fixed sequence of
#   regsub passes; their order matters (for example, double-bracket links are
#   reduced to single brackets before single-bracket URL links are handled).
#
# Implementation note:
#   In Tcl's regular expressions the first quantifier of a branch decides
#   whether the whole branch matches greedily or non-greedily.  The patterns
#   below therefore avoid mixing greedy and non-greedy quantifiers and use
#   negated character classes to bound a match instead.
#
# Known limitations:
#   A CamelCase word inside the description part of a link, or inside
#   nowiki/code/pre text, is still turned into a link.
proc convert_body {thebody} {

    # double-bracketed expressions with URLs: description and URL swap places
    regsub -all {\[\[(https?://[^] ]+) ([^]]*)\]\]} $thebody {[\2|\1]} thebody

    # double-bracketed expressions without URLs
    # Internal links might have spaces between the words, e.g. [[My Node]], and
    # OddMuse maps these to underscores for the sake of the file names - which
    # are how the pages are referenced.  Each pass replaces the first remaining
    # space of every such link; repeat until no link contains a space.
    while { [regsub -all {(\[\[[^] ]+) ([^]]*\]\])} $thebody {\1_\2} thebody] > 0 } {}

    # then make the double brackets single brackets
    regsub -all {\[\[(.*?)\]\]} $thebody {[\1]} thebody

    # single-bracketed URL without a description
    regsub -all {\[(https?://[^] ]+)\]} $thebody {[note|\1]} thebody
    # single-bracketed URL followed by a description
    regsub -all {\[(https?://[^] ]+) ([^]]*)\]} $thebody {[\2|\1]} thebody

    # images: keep the file name, drop any text that follows it
    regsub -all {\[image:([^] ]+)[^]]*\]} $thebody {!\1!} thebody

    # Look for CamelCase words and force them to links.  The character before
    # the word is checked so that words already inside link or image markup,
    # escaped with a leading "!", or embedded in a URL or a longer word are
    # left alone.
    regsub -all {(^|[^]A-Za-z0-9[!|_/.:=?&#-])([A-Z]+[a-z]+[A-Z][A-Za-z0-9]*)\M} \
        $thebody {\1[\2]} thebody

    # Look for escaped CamelCase words, remove the escape.  The lookahead skips
    # image markup such as !FooBar.png! so its leading "!" is not removed.
    regsub -all {!([A-Z]+[a-z]+[A-Z][A-Za-z0-9]*)(?![^ \t\n!]*!)} $thebody {\1} thebody

    # Headings - deepest level first so that shorter runs of "=" cannot match
    # part of a longer marker.  Confluence writes headings as "hN. title".
    for {set level 6} {$level >= 1} {incr level -1} {
        set marks [string repeat = $level]
        regsub -all -lineanchor \
            [format {^%s *([^=\n]*[^=\n ]) *%s} $marks $marks] \
            $thebody "h$level. \\1" thebody
    }

    # Lists - nothing to do for bullets or numbered.  For definition lists there
    # is no good analog.  Just approximate by bolding the term with a colon and
    # two spaces following
    regsub -all -lineanchor {^;([^:\n]+:) *} $thebody {*\1*  } thebody

    # Bold and Italics - may span newlines; bold first because its marker
    # contains the italic marker
    regsub -all {'''(.+?)'''} $thebody {*\1*} thebody
    regsub -all {''(.+?)''} $thebody {_\1_} thebody

    # Tables
    regsub -all {\|\|} $thebody {|} thebody

    # Code lines: wrap each run of lines indented by two or more spaces
    regsub -all -lineanchor {((^  +[^\n]+\n)+)} $thebody "{noformat}\n\\1{noformat}\n" thebody

    # Indented Text (leading colon) - just make 2 spaces - do this after handling code lines
    regsub -all -lineanchor {^:} $thebody {  } thebody

    # Horizontal lines - four dashes on a line alone - same in confluence.

    # HTML Tags that are supported
    regsub -all {</?(em|i)>} $thebody {_} thebody
    regsub -all {</?(strong|b)>} $thebody {*} thebody
    regsub -all {</?u>} $thebody {+} thebody

    # Fixed width text.  The special tags nowiki, code and pre are different in
    # OddMuse but are all translated to the same double-brace construct.  The
    # replacement is built in a quoted string so that no backslashes end up in
    # the output (regsub only strips a backslash before "&", a digit or
    # another backslash).
    set mono "{{\\1}}"
    foreach tag {tt nowiki code pre} {
        regsub -all "<$tag>(.+?)</$tag>" $thebody $mono thebody
    }

    return $thebody
}

# ExtractionFromOddmuseFiles --
#
#   Steps through the OddMuse page files, converts the page text of each one
#   via convert_body and writes the result to a similarly named file with a
#   .txt extension.
#
# Arguments:
#   None.  Reads gbl(wikisource), a glob pattern selecting the directories to
#   scan, and gbl(work), the directory that receives the output files.
#
# Results:
#   None.  Reports each file whose contents do not match the expected layout
#   and prints "Scan complete" when finished.
proc ExtractionFromOddmuseFiles {} {
    global gbl

    # The delimiter used in OddMuse database files: Control-^ (0x1E) twice.
    # It is spelled with escapes because raw control characters do not
    # survive copy and paste; with the characters lost the set command has no
    # value argument and fails.
    # NOTE(review): confirm this separator against one of your own .db files.
    set doohicky "\u001e\u001e"

    # The page text is the field that follows the "text" key.
    set bodyPattern "${doohicky}3text${doohicky}3(.*?)${doohicky}3"

    # Step through the directories
    foreach dir1 [glob $gbl(wikisource)] {
        # And each file in each directory.  -nocomplain keeps a directory
        # without any .db file from aborting the whole scan.
        foreach filename [glob -nocomplain -directory $dir1 -- *.db] {
            set fp [open $filename r]
            set data [read $fp]
            close $fp

            if { ! [regexp $bodyPattern $data match thebody] } {
                puts "file: \"$filename\" had a format\n which didn't match my expectations"
            } else {
                set thebody [convert_body $thebody]
                set ofilename \
                    "${gbl(work)}/[file rootname [file tail $filename]].txt"
                set fp [open $ofilename w]
                puts $fp $thebody
                close $fp
            }
        }
    }
    puts "Scan complete"
}

# MAIN LINE
# Runs the conversion as soon as the script is executed.

ExtractionFromOddmuseFiles
# exit