Importing pbwiki to mediawiki

From KnowledgeLab
Jump to: navigation, search

To move the old pbwiki to MediaWiki, the .zip backup file was extracted into a directory of files, and a Perl script (below) was run over that directory to create a MediaWiki export/import XML file. One of the biggest problems was bad HTML markup in the text.

It's not perfect by any means, but it gives a starting point to work from.

#!/usr/bin/perl

# pbwiki to media wiki conversion script
# Copyright (C) 2005 ekes -at- aktivix.org.uk 

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

use strict;

# --- Configuration -------------------------------------------------------

# Directory holding the unpacked pbwiki backup files; it is scanned with
# readdir and used as the prefix of every page path that gets converted.
my $dir = '/tmp/knowledgelab';
# Path of the MediaWiki XML file this script writes.
my $exportfile = '/tmp/knowledgemw.xml';
# Timestamp given to the current (un-suffixed) version of every page.
my $present = '2005-11-10T00:00:00Z';
# Username written into the <contributor> element of every revision.
my $user = 'pbwikiImport';

# --- State filled in by the directory scan --------------------------------

# Names of the page files found in $dir (file names containing no dot).
my @pages;
# Page name => array ref of revision suffixes, collected from file names
# of the form "page.revision".
my %revisions;

# convertWiki( $date, $page )
#
# Read the pbwiki page file at path $page, translate its pbwiki markup into
# MediaWiki markup one line at a time, and return the result wrapped in a
# <revision> XML fragment.
#
#   $date - timestamp string, copied verbatim into <timestamp>
#   $page - path of the file to convert
#
# The contributor name is taken from the file-level $user.
#
# Returns the <revision>...</revision> fragment as a string.
# Dies if $page cannot be opened, like the other open calls in this script.
sub convertWiki {
	my ( $date, $page ) = @_;
	my $text = '';

	open (my $wiki, '<', $page) or die "Can't open $page: $!";
	while (my $line = <$wiki>) {
		# The backup stores quotes backslash-escaped; drop the backslashes.
		$line =~ s/\\'/'/g;
		$line =~ s/\\"/"/g;
		$line =~ s/\r\n/\n/g; # Windows zipfile

		# Escape ampersands so the text is well-formed inside the XML file.
		# (The published copy of this script had lost the entity and read
		# s/&/&/g, which does nothing.)
		$line =~ s/&/&amp;/g;
		$line =~ s/<CENTER>//gi;    # ARGH too many broken
		$line =~ s/<\/CENTER>//gi;  # unclosed etc for xml
		$line =~ s/<br>/<br \/>/g;

		# Headings: leading !!!, !! or ! become ===, == or = pairs.
		# The longest prefix has to be tried first.
		$line =~ s/^!!!(.*)/===$1===/;
		$line =~ s/^!!(.*)/==$1==/;
		$line =~ s/^!(.*)/=$1=/;

		# Horizontal rule: MediaWiki wants four dashes.
		$line =~ s/^---/----/;

		# **bold** and __underline__.
		$line =~ s/\*\*(.*)\*\*/'''$1'''/g;
		$line =~ s/__(.*)__/<u>$1<\/u>/g;
		# this substitution might be more accurate:
		# (I didn't want to change your script since I'm not 100% sure of my new regex -DaveBrondsema)
		# $line =~ s/(\s)-(\S.+)-([\s\$])/$1<s>$2<\/s>$3/g;
		$line =~ s/\s -(\S.*)- \s/<strike>$1<\/strike>/g;

		# these 2 substitutions might be more accurate more of the time (it's not completely perfect):
		# (I didn't want to change your script since I'm not 100% sure of my new regex -DaveBrondsema)
		# $line =~ s/\[([^:]*)\]/[[$1]]/g;
		# $line =~ s/\[(.*)\|(.*)\]/[$1 $2]/g;

		# [scheme:...] links get double brackets; a | separator inside
		# them is then replaced by a space.
		$line =~ s/\[(\w+:.*)\]/[[$1]]/g;
		$line =~ s/\[\[(.*)\|(.*)\]\]/[[$1 $2]]/g;

		# Bracketed targets ending in jpg or png become image links.
		$line =~ s/\[\[(:?\S+(jpg|png))\]\]/[[image:$1]]/g;
		# A ~ in front of a CamelCaps word suppresses linking: drop the ~.
		$line =~ s/\s~([A-Z][a-z0-9_']+[A-Z][a-z0-9_']+)/ $1/g;
		# This will make links out of 2-hump CamelCaps words
		$line =~ s/\s([A-Z][a-z0-9_']+[A-Z][a-z0-9_']+)/ [[$1]]/g;
		# Absolute links back into the old pbwiki become internal links.
		$line =~ s/\[\[http:\/\/knowledgelab\.pbwiki\.com\/index.php\?wiki=(\S+)\s/\[\[$1\|/gi;
		$text .= $line;
	}
	close $wiki;

	return "<revision>\n"
	       ."<timestamp>$date</timestamp>\n"
	       ."<contributor><username>$user</username></contributor>\n"
	       ."<text>\n$text\n</text>"
	       ."</revision>\n";
}

# Scan $dir and sort its entries into current pages and archived revisions.
# A name with no dot is a page and goes into @pages; a name of the form
# "page.revision" adds "revision" to $revisions{page}. Names with more
# dot-separated parts are ignored.
opendir (my $dh, $dir) or die "Can't open $dir: $!";
while (defined (my $file = readdir $dh)) {
	# Skip ".", ".." and any other dot-file. The split below discards a
	# leading dot, so ".foo" would otherwise be recorded as a page "foo"
	# that has no file behind it.
	next if $file =~ /^\./;

	# Split the name into its dot-separated parts.
	my @page = ( $file =~ /([^.]+)\.?/g );
	push @pages, $page[0] if @page == 1;
	push @{ $revisions{$page[0]} }, $page[1] if @page == 2;

}
closedir $dh;

# Write the MediaWiki import file: one <page> element per entry in @pages,
# holding the current version (stamped $present) followed by one <revision>
# per archived revision recorded in %revisions.
open (my $xml, ">", $exportfile) or die "Can't open $exportfile to write: $!";
print {$xml} "<mediawiki version=\"0.1\" xml:lang=\"en\">\n"
	or die "Can't write to $exportfile: $!";

for my $page (@pages) {
	# The title comes from a file name, so escape the XML special
	# characters before embedding it.
	my $title = $page;
	$title =~ s/&/&amp;/g;
	$title =~ s/</&lt;/g;
	$title =~ s/>/&gt;/g;

	my $output = "<page>\n"
	            ."<title>$title</title>\n";

	$output .= convertWiki( $present, "$dir/$page" );

	for my $revision (@{$revisions{$page}}) {
		# Revision suffixes are expected to hold five dash-separated
		# numbers, used here as year-month-day-hour-minute.
		my @t = ( $revision =~ /(\d+)-(\d+)-(\d+)-(\d+)-(\d+)/ );
		if (@t != 5) {
			# Without a match the timestamp would come out as "--T::00Z".
			warn "Skipping $dir/$page.$revision: no date in revision name\n";
			next;
		}
		my $timestamp = $t[0].'-'.$t[1].'-'.$t[2].'T'.$t[3].':'.$t[4].':00Z';
		$output .= convertWiki( $timestamp, "$dir/$page.$revision" );
	}

	$output .= "</page>\n";
	print {$xml} $output or die "Can't write to $exportfile: $!";
}

print {$xml} "</mediawiki>\n" or die "Can't write to $exportfile: $!";
# Buffered write errors only surface at close, so check it.
close $xml or die "Can't close $exportfile: $!";

print "done\n";