Importing pbwiki to mediawiki
Jump to navigation
Jump to search
To move the old pbwiki to MediaWiki, the .zip backup file was extracted into a directory of files, and a Perl script (below) was run over that directory to create a MediaWiki export/import XML file. One of the biggest problems was bad HTML markup in the text.
The script is not perfect by any means, but it gives you something to work from.
#!/usr/bin/perl
# pbwiki to media wiki conversion script
# Copyright (C) 2005 ekes -at- aktivix.org.uk
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# Usage: unpack the pbwiki .zip backup into $dir, run this script, then
# import $exportfile into MediaWiki.
#
# Files in $dir are interpreted by name:
#   Name            current text of page "Name"
#   Name.REVISION   an older revision, where REVISION contains
#                   year-month-day-hour-minute as five dash-separated numbers
# Any other file name (three or more dot-separated parts) is ignored.

use strict;
use warnings;

my $dir        = '/tmp/knowledgelab';       # directory holding the unpacked backup
my $exportfile = '/tmp/knowledgemw.xml';    # MediaWiki import file to create
my $present    = '2005-11-10T00:00:00Z';    # timestamp given to each current page
my $user       = 'pbwikiImport';            # contributor name on every revision

my @pages;        # names of files that have no revision suffix
my %revisions;    # page name => array ref of revision suffixes

# convertWiki($date, $page)
#
# Reads the pbwiki source file at path $page, rewrites its markup line by
# line into MediaWiki markup, and returns one <revision> XML element (as a
# string) stamped with $date and the global $user.
#
# Dies if the file cannot be opened, like the other opens in this script.
sub convertWiki {
    my ( $date, $page ) = @_;
    my $text = '';    # stays '' (not undef) when the file is empty

    open( my $wiki, '<', $page ) or die "Can't open $page: $!";
    while ( my $line = <$wiki> ) {

        # Undo backslash-escaped quotes.
        $line =~ s/\\'/'/g;
        $line =~ s/\\"/"/g;
        $line =~ s/\r\n/\n/g;        # Windows zipfile

        # Escape ampersands so the output is well-formed XML. The published
        # copy of this script showed a no-op s/&/&/ here, presumably an
        # "&amp;" entity that was lost when the page was rendered.
        $line =~ s/&/&amp;/g;

        $line =~ s/<CENTER>//gi;     # ARGH too many broken
        $line =~ s/<\/CENTER>//gi;   # unclosed etc for xml
        $line =~ s/<br>/<br \/>/g;

        # Headings: leading !!!, !! and ! become ===, == and =. Once one of
        # these matches the line starts with '=', so the later ones cannot
        # match it again.
        $line =~ s/^!!!(.*)/===$1===/;
        $line =~ s/^!!(.*)/==$1==/;
        $line =~ s/^!(.*)/=$1=/;

        # Horizontal rule.
        $line =~ s/^---/----/;

        # **bold** and __underline__.
        $line =~ s/\*\*(.*)\*\*/'''$1'''/g;
        $line =~ s/__(.*)__/<u>$1<\/u>/g;

        # this substitution might be more accurate:
        # (I didn't want to change your script since I'm not 100% sure of my new regex -DaveBrondsema)
        # $line =~ s/(\s)-(\S.+)-([\s\$])/$1<s>$2<\/s>$3/g;
        $line =~ s/\s -(\S.*)- \s/<strike>$1<\/strike>/g;

        # these 2 substitutions might be more accurate more of the time (it's not completely perfect:
        # (I didn't want to change your script since I'm not 100% sure of my new regex -DaveBrondsema)
        # $line =~ s/\[([^:]*)\]/[[$1]]/g;
        # $line =~ s/\[(.*)\|(.*)\]/[$1 $2]/g;
        $line =~ s/\[(\w+:.*)\]/[[$1]]/g;
        $line =~ s/\[\[(.*)\|(.*)\]\]/[[$1 $2]]/g;
        $line =~ s/\[\[(:?\S+(jpg|png))\]\]/[[image:$1]]/g;

        # This will make links out of 2-hump CamelCaps words. A word written
        # as ~CamelCaps only loses its tilde and stays plain text. Both cases
        # are handled in a single pass: doing them as two substitutions
        # stripped the tilde first and then linked the word anyway.
        $line =~ s{\s(~?)([A-Z][a-z0-9_']+[A-Z][a-z0-9_']+)}
                  {$1 ? " $2" : " [[$2]]"}ge;

        # Links back into the old wiki: keep only the page name and turn the
        # whitespace after it into the '|' label separator.
        $line =~ s/\[\[http:\/\/knowledgelab\.pbwiki\.com\/index.php\?wiki=(\S+)\s/\[\[$1\|/gi;

        $text .= $line;
    }
    close $wiki;

    return "<revision>\n"
        . "<timestamp>$date</timestamp>\n"
        . "<contributor><username>$user</username></contributor>\n"
        . "<text>\n$text\n</text>"
        . "</revision>\n";
}

# Sort the directory entries into current pages and older revisions.
opendir( my $dh, $dir ) or die "Can't open $dir: $!";
while ( defined( my $file = readdir $dh ) ) {
    next if $file =~ /^\.\.?$/;

    # Split the file name on dots: one part is a page, two parts are
    # page + revision suffix.
    my @page = ( $file =~ /([^.]+)\.?/g );
    push @pages, $page[0] if @page == 1;
    push @{ $revisions{ $page[0] } }, $page[1] if @page == 2;
}
closedir $dh;

open( my $xml, '>', $exportfile )
    or die "Can't open $exportfile to write: $!";
print {$xml} "<mediawiki version=\"0.1\" xml:lang=\"en\">\n";

for my $page (@pages) {
    my $output = "<page>\n" . "<title>$page</title>\n";

    # The current text is stamped with $present.
    $output .= convertWiki( $present, "$dir/$page" );

    # "|| []" avoids creating empty entries for pages without revisions.
    for my $revision ( @{ $revisions{$page} || [] } ) {
        my @t = ( $revision =~ /(\d+)-(\d+)-(\d+)-(\d+)-(\d+)/ );

        # A suffix that is not a date (e.g. the "jpg" of "Name.jpg") would
        # otherwise produce the malformed timestamp "--T::00Z".
        if ( @t != 5 ) {
            warn "Skipping $dir/$page.$revision: suffix is not a revision date\n";
            next;
        }

        my $timestamp = "$t[0]-$t[1]-$t[2]T$t[3]:$t[4]:00Z";
        $output .= convertWiki( $timestamp, "$dir/$page.$revision" );
    }

    $output .= "</page>\n";
    print {$xml} $output;
}

print {$xml} "</mediawiki>\n";

# Buffered write errors only surface at close, so check it.
close $xml or die "Can't finish writing $exportfile: $!";
print "done\n";