# HG changeset patch # User ali@yendor.vm.bytemark.co.uk # Date 1163665826 0 # Node ID ae7b3fa753dcf20dd8e8ab6a356b73a3a3621f66 First cut. Distinctly raw around the edges: * Assumes it will be running in /home/ali/wk/slashem/web.scripts * Assumes cache directory will be in topdir * No build system (simple compiling and linking against libxml2) * No configure system (e.g., tagsoup) * Output XML untested * Doesn't set bugzilla maintainer or exporter * Handling of artifact priorities and resolution is suspect diff -r 000000000000 -r ae7b3fa753dc README.cache --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.cache Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,19 @@ +The cache directory contains the following: + +sf/attachments/<file_id> + Raw attachments as downloaded from sourceforge by sf2bz + +sf/artifacts/<atid>/<aid>.html + Tagsoup detailed artifacts as downloaded from sourceforge by sf2bz + +sf/users/<user_id>.html + Tagsoup user profiles as downloaded from sourceforge by sf2bz + +attachments/<file_id>.xml + Attachments converted to xml by sf2bz + +artifacts/<atid>/<aid>.xhtml + Conversion of detailed artifacts to xhtml by tagsoup + +users/<user_id>.xhtml + Conversion of user profiles to xhtml by tagsoup diff -r 000000000000 -r ae7b3fa753dc attachment_descs.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/attachment_descs.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc attachment_details.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/attachment_details.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc attachment_gather.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/attachment_gather.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + attachment-details.xml + + + 
xpointer(/attachments/attachment[attachid=' + + ']/*) + + + + attachment-descriptions.xml + + + xpointer(/attachments/attachment[attachid=' + + ']/desc)element(null) + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc attachment_post.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/attachment_post.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc bugzilla.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bugzilla.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,236 @@ + + + + + + + + + + + + : + + + + + + + + + + + + + + + + + + + + + + + + + + + cache/attachments/ + + .xml + + xpointer(/attachment) + + + + + + + + + who + + + + + + + + users.xml + + xpointer(/users/user[.=" + + @users.sourceforge.net"]) + + + + + + + + + + + https://sourceforge.net/tracker/?group_id= + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+ + + + + reporter + + + + + + + assigned_to + + + + + + + + P5 + P4 + P3 + P2 + P1 + + + + + + + + + + + UNCONFIRMED + + ASSIGNED + + + NEW + RESOLVED + + + + + +

+ + + + + + + + + + + + + + + blocker + critical + major + minor + trivial + normal + + + enhancement + + + + + +

+ + + +

+ + + + + + + + + + + + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc bugzilla_post.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bugzilla_post.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc get_attached_files.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_attached_files.c Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,148 @@ +/* + * This program parses the XML data provided by sourceforge to extract the + * list of "attached files" (which are not themselves included in the XML + * stream and therefore need to be seperately downloaded for archiving). + */ + +#include +#include +#include +#include +#include + +struct artifact_type { + char *name; + int id; +} artifact_types[] = { + "Bugs", 109746, + "Support Requests", 209746, + "Patches", 309746, + "Feature Requests", 359746, +}; + +#ifndef NO_ELEMS +#define NO_ELEMS(array) (sizeof(array)/sizeof(*(array))) +#endif + +void parse_history(xmlDocPtr doc,const xmlChar *artifact_id, + const xmlChar *artifact_type,xmlNodePtr node) +{ + xmlNodePtr field,cur; + xmlAttrPtr attr; + xmlChar *name=NULL; + xmlChar *field_name=NULL; + xmlChar *text; + int file_id,i; + for(field=node->xmlChildrenNode;field;field=field->next) + if (!xmlStrcmp(field->name,(const xmlChar *)"field")) + { + for(attr=field->properties;attr;attr=attr->next) + if (!xmlStrcmp(attr->name,(const xmlChar *)"name")) + { + name=xmlNodeListGetString(doc,attr->children,1); + if (!xmlStrcmp(name,(const xmlChar *)"field_name")) + field_name=xmlNodeListGetString(doc,field->children,1); + if (!xmlStrcmp(field_name,(const xmlChar *)"File Added") && + !xmlStrcmp(name,(const xmlChar *)"old_value")) + { + text=xmlNodeListGetString(doc,field->children,1); + if (sscanf(text,"%d",&file_id)==1) + { + for(i=NO_ELEMS(artifact_types)-1;i>=0;i--) + if (!strcmp(artifact_types[i].name, + (const char *)artifact_type)) + break; + if (i>=0) + printf("%d 
%s %d\n", + artifact_types[i].id,artifact_id,file_id); + else + fprintf(stderr, + "Warning: Unknown artifact type \"%s\" - " + "attached file %s ignored\n", + artifact_type,text); + } + xmlFree(text); + } + xmlFree(name); + } + } + if (field_name) + xmlFree(field_name); +} + +void parse_artifact(xmlDocPtr doc,xmlNodePtr node) +{ + xmlNodePtr field,cur; + xmlAttrPtr attr; + xmlChar *name=NULL; + xmlChar *id=NULL; + xmlChar *type=NULL; + xmlChar *text; + for(field=node->xmlChildrenNode;field;field=field->next) + if (!xmlStrcmp(field->name,(const xmlChar *)"field")) + { + for(attr=field->properties;attr;attr=attr->next) + if (!xmlStrcmp(attr->name,(const xmlChar *)"name")) + { + name=xmlNodeListGetString(doc,attr->children,1); + if (!xmlStrcmp(name,(const xmlChar *)"artifact_id")) + id=xmlNodeListGetString(doc,field->children,1); + if (!xmlStrcmp(name,(const xmlChar *)"artifact_type")) + type=xmlNodeListGetString(doc,field->children,1); + } + if (!xmlStrcmp(name,(const xmlChar *)"artifact_history")) + for(cur=field->xmlChildrenNode;cur;cur=cur->next) + if (!xmlStrcmp(cur->name,"history")) + parse_history(doc,id,type,cur); + if (name) + { + xmlFree(name); + name=NULL; + } + } + if (id) + xmlFree(id); + if (type) + xmlFree(type); +} + +int main(int argc,char **argv) +{ + xmlDocPtr doc; + xmlNodePtr cur; + doc=xmlParseFile(argv[1]); + if (!doc) + { + fprintf(stderr,"Document not parsed successfully.\n"); + exit(1); + } + cur=xmlDocGetRootElement(doc); + if (!cur) + { + fprintf(stderr,"Empty document.\n"); + xmlFreeDoc(doc); + exit(1); + } + if (xmlStrcmp(cur->name,(const xmlChar *)"project_export")) + { + fprintf(stderr,"%s does not appear to be a project export document.\n", + argv[1]); + xmlFreeDoc(doc); + exit(1); + } + for(cur=cur->xmlChildrenNode;cur;cur=cur->next) + if (!xmlStrcmp(cur->name,(const xmlChar *)"artifacts")) + break; + if (!cur) + { + fprintf(stderr,"%s does not appear to contain an artifacts node.\n", + argv[1]); + xmlFreeDoc(doc); + exit(1); + } + 
for(cur=cur->xmlChildrenNode;cur;cur=cur->next) + if (!xmlStrcmp(cur->name,(const xmlChar *)"artifact")) + parse_artifact(doc,cur); + xmlFreeDoc(doc); + exit(0); +} diff -r 000000000000 -r ae7b3fa753dc get_users.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_users.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,39 @@ + + + + + +

+ + +

+ + + + + + + + + + + + + + + + + @users.sourceforge.net + + + + + + + diff -r 000000000000 -r ae7b3fa753dc sf2bz.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sf2bz.sh Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,109 @@ +#!/bin/sh +if [ $# -ne 2 ]; then + echo "Usage: sf2bz .xml .xml" >&2 + exit 1 +fi +input=$1 +output=$2 +topdir=/home/ali/wk/slashem/web.scripts +cd $topdir +cache_dir=$topdir/cache +tagsoup=tagsoup/tagsoup-1.0.1.jar +#tmpdir=`mktemp -d /tmp/XXXXXXXX` || exit 1 +tmpdir=/tmp/sf2bz; rm -rf $tmpdir; mkdir $tmpdir +mkdir -p $cache_dir/sf/artifacts +# Sourceforge's export is pretty good, but misses a few things. +# Deal with one of these (descriptions for attached files) here. +echo '' > $tmpdir/descriptions.xml +echo '' \ + >> $tmpdir/descriptions.xml +./get_attached_files $input | cut -s "-d " -f 1,2 | sort -n | uniq | \ + while read atid aid; do + mkdir -p $cache_dir/sf/artifacts/$atid + if [ ! -r $cache_dir/sf/artifacts/$atid/$aid.html ]; then + wget -q -O $cache_dir/sf/artifacts/$atid/$aid.html \ + "http://sourceforge.net/tracker/index.php?func=detail&aid=$aid&group_id=9746&atid=$atid" + fi + mkdir -p $cache_dir/artifacts/$atid + if [ ! -r $cache_dir/artifacts/$atid/$aid.xhtml ]; then + java -jar $tagsoup $cache_dir/sf/artifacts/$atid/$aid.html \ + > $cache_dir/artifacts/$atid/$aid.xhtml + fi + echo "" >> $tmpdir/descriptions.xml +done +echo '' >> $tmpdir/descriptions.xml +xsltproc --xinclude attachment_descs.xsl $tmpdir/descriptions.xml \ + > $tmpdir/attachment-descriptions.xml +# Some details of attachments are included in the project export. +xsltproc attachment_details.xsl $input > $tmpdir/attachment-details.xml +# Then convert the attachments to XML (as required by bugzilla) +# pulling in the various bits of information we have gathered. +mkdir -p $cache_dir/sf/attachments $cache_dir/attachments +mkdir -p $tmpdir/jail +./get_attached_files artifacts.xml | while read atid aid file_id; do + if [ ! 
-r $cache_dir/sf/attachments/$file_id ]; then + wget -N -O $cache_dir/sf/attachments/$file_id "http://sourceforge.net/tracker/download.php?group_id=9746&atid=$atid&file_id=$file_id&aid=$aid" + fi + if [ ! -r $cache_dir/attachments/$file_id.xml ]; then + type=`file -bi $cache_dir/sf/attachments/$file_id | sed 's/;.*//'` + case "$type" in + text/*) + (cd $tmpdir/jail; patch --dry-run -s -f \ + < $cache_dir/sf/attachments/$file_id > /dev/null 2>&1) + if [ $? -lt 2 ]; then + pflag="-p" + else + pflag="" + fi + ;; + *) + pflag="" + ;; + esac + ./xml_attached_file $pflag -i $file_id -t $type \ + $cache_dir/sf/attachments/$file_id > $tmpdir/$file_id.1.xml + xsltproc attachment_gather.xsl $tmpdir/$file_id.1.xml \ + > $tmpdir/$file_id.2.xml + xsltproc --xinclude attachment_post.xsl $tmpdir/$file_id.2.xml \ + > $cache_dir/attachments/$file_id.xml + fi +done +# We can't get hold of descriptions of deleted attachments, so we +# keep a copy of the file in case we need it. Ideally, we'd make sure +# that we only add descriptions, but that's for another day. +cp $tmpdir/attachment-descriptions.xml $topdir +xsltproc get_users.xsl $input > $tmpdir/raw_users.xml +echo '' \ + >> $tmpdir/users.1.xml +for file in $cache_dir/artifacts/*/*.xhtml; do + echo "" >> $tmpdir/users.1.xml +done +echo "" >> $tmpdir/users.1.xml +echo '' >> $tmpdir/users.1.xml +xsltproc --xinclude user_names.xsl $tmpdir/users.1.xml > $tmpdir/users.2.xml +mkdir -p $cache_dir/sf/users $cache_dir/users +echo '' \ + >> $tmpdir/users.3.xml +xsltproc unknown_users.xsl $tmpdir/users.2.xml | while read user; do + case $user in + *@users.sourceforge.net) + user_id=`echo $user | sed 's/@users.sourceforge.net$//'` + if [ ! -r $cache_dir/sf/users/$user_id.html ]; then + wget -N -O $cache_dir/sf/users/$user_id.html \ + "http://sourceforge.net/users/$user_id/" + fi + if [ ! 
-r $cache_dir/users/$user_id.xhtml ]; then + java -jar $tagsoup $cache_dir/sf/users/$user_id.html \ + > $cache_dir/users/$user_id.xhtml + fi + echo "" \ + >> $tmpdir/users.3.xml + esac +done +echo "" >> $tmpdir/users.3.xml +echo '' >> $tmpdir/users.3.xml +xsltproc --xinclude user_names.xsl $tmpdir/users.3.xml > users.xml +ln -s $cache_dir/attachments $tmpdir +xsltproc bugzilla.xsl $input > artifacts.tmp.xml +xsltproc --xinclude bugzilla_post.xsl artifacts.tmp.xml > $output +#rm -rf $tmpdir artifacts.tmp.xml diff -r 000000000000 -r ae7b3fa753dc sf2bz.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sf2bz.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,29 @@ + + + + + + + + + + + + - + + - + + + + : + + : + + UTC + + + diff -r 000000000000 -r ae7b3fa753dc tagsoup/SRC --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tagsoup/SRC Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,2 @@ +http://mercury.ccil.org/~cowan/XML/tagsoup/ +http://mercury.ccil.org/~cowan/XML/tagsoup/tagsoup-1.0.1.jar diff -r 000000000000 -r ae7b3fa753dc unknown_users.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unknown_users.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc user_names.xsl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/user_names.xsl Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + @users.sourceforge.net + + + + + + + + + @users.sourceforge.net + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r ae7b3fa753dc xml_attached_file.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xml_attached_file.c Thu Nov 16 08:30:26 2006 +0000 @@ -0,0 +1,144 @@ +/* + * This program converts an attached file (which has previously been + * downloaded from sourceforge) into bugzilla-style XML. 
+ */ + +#include +#include +#include +#ifdef __linux__ +#include +#endif + +#ifdef __linux__ +static struct option long_options[] = { + {"type", 1, 0, 't'}, + {"id", 1, 0, 'i'}, + {"patch", 0, 0, 'p'}, + {0, 0, 0, 0} +}; + +static char *long_option_str(char opt) +{ + int i; + static char buf[100]; + for(i=0;long_options[i].name;i++) + if (long_options[i].val==opt) + { + sprintf(buf,", --%s",long_options[i].name); + return buf; + } + return ""; +} +#else +#define long_option_str(opt) "" +#endif + +usage() +{ + fprintf(stderr,"Usage: xml_attached_file [OPTION] file\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -t%s=MIME-type MIME type to use\n", + long_option_str('t')); + fprintf(stderr," -i%s=ID Attachment ID to use\n", + long_option_str('i')); + fprintf(stderr," -p%s Mark attachment as a patch\n", + long_option_str('p')); + exit(1); +} + +/* Note that we limit line lengths to 76 characters following RFC 2045 + * (bugzilla uses MIME::Base64). This isn't strictly compliant with RFC 4648. 
+ */ +static void base64_encode(FILE *in,FILE *out) +{ + int ng=0; /* 76 characters == 19 groups */ + size_t nb; + unsigned char bytes[3]; + const char alphabet[64]= + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + while ((nb=fread(bytes,1,3,in))==3) + { + if (ng++>=19) + { + putc('\n',out); + ng=1; + } + putc(alphabet[bytes[0]>>2],out); + putc(alphabet[(bytes[0]<<4|bytes[1]>>4)&0x3F],out); + putc(alphabet[(bytes[1]<<2|bytes[2]>>6)&0x3F],out); + putc(alphabet[bytes[2]&0x3F],out); + } + if (nb) + { + if (ng>=19) + putc('\n',out); + putc(alphabet[bytes[0]>>2],out); + if (nb==2) + { + putc(alphabet[(bytes[0]<<4|bytes[1]>>4)&0x3F],out); + putc(alphabet[bytes[1]<<2&0x3F],out); + } + else + { + putc(alphabet[bytes[0]<<4&0x3F],out); + putc('=',out); + } + putc('=',out); + } +} + +int main(int argc,char **argv) +{ + int c; + FILE *fp; + unsigned long id=0; + int ispatch=0; + char *type=NULL; + for(;;) + { +#ifdef __linux__ + int option_index=0; + c=getopt_long(argc,argv,"t:i:p",long_options,&option_index); +#else + c=getopt(argc,argv,"t:i:p"); +#endif + if (c<0) + break; + switch(c) + { + case 't': + type=optarg; + break; + case 'i': + id=strtoul(optarg,NULL,10); + break; + case 'p': + ispatch=1; + break; + default: + usage(); + } + } + if (optind\n",ispatch); + if (id) + printf(" %lu\n",id); + if (type) + printf(" %s\n",type); + printf(" \n"); + base64_encode(fp,stdout); + printf("\n \n"); + printf("\n"); + exit(0); +}