#! /usr/bin/perl ################################################################ # # Copyright(c) 2006 Whitehead Institute for Biomedical Research. # All Rights Reserved # # Author: K.Walker # # Purpose: Perl Script to populate the Entrez Gene Database # # Date: August 2005 # # Note: This script will download, parse and populate all tables # EXCEPT refSeqSummary. refSeqSummary table is populated by # data downloaded from UCSC. If interested in scripts and/or # protocol, email BaRC. # ################################################################# # These variables are for login access to your mysql database. # Change information as appropriate my $host = "machine_name"; my $db = "name_of_database"; my $user = "user_name"; my $pw = "password_for_user"; # location of script $location_of_script = "/dir/somewhere/"; # where files will be downloaded and unzipped $basic_dir = "/dir/somewhere_new"; # where sql formated files will be saved $SQL = "/dir/somewhere_else"; # End of User variables # Begin perl script # remove possible trailing / from dir variables $SQL =~ s/\/$//;$basic_dir =~ s/\/$//;$location_of_script =~ s/\/$//; # Test to make sure necessary dir exist if (!(-e $location_of_script && -w $location_of_script && -x $location_of_script )) { print "$location_of_script does not exist or isn't readable and writable. Either create dir or change the value of variable location_of_script\n"; exit; } if (!(-e $basic_dir && -w $basic_dir && -x $basic_dir )) { print "$basic_dir does not exist or isn't readable and writable. Either create dir or change the value of variable basic_dir\n"; exit; } if (!(-e $SQL && -w $SQL && -x $SQL )) { print "$SQL does not exist or isn't readable and writable. Either create dir or change the value of variable SQL\n"; exit; } # list of all tables @tables = qw( mim2gene gene2accession gene_history gene_info gene2go gene2pubmed gene2refseq gene2sts gene2unigene generifs_basic interactions tax2name ); # downloaded files - zipped @zip_files = qw( gene_history gene2accession gene_info gene2go gene2pubmed gene2refseq ); # downloaded files - unzipped @download_files = qw( gene2sts gene2unigene mim2gene ); # downloaded files -- RIFs @geneRIF = qw( generifs_basic interactions ); # remove previously downloaded files foreach $old (@tables) { `rm -Rf "$basic_dir/$old" `; } `rm -Rf "$basic_dir/*.txt" `; # Since files are located in multiple location, # Different groups need to treated slightly differently # using unix command wget - download new zip files # rm *.zip file after unzipping foreach $zip (@zip_files) { my $ftp_site = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" . $zip . ".gz"; my $zip_input_file = "$basic_dir" ."/" . $zip . ".gz"; print "now downloading $zip here: $zip_input_file\n"; `wget -q $ftp_site -O $zip_input_file`; `gunzip $zip_input_file`; `rm -Rf $zip_input_file`; } # using unix command wget - download new files foreach $down (@download_files) { my $ftp_site = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/" . $down; my $zip_input_file = "$basic_dir" . "/" . $down; print "now downloading $down here: $zip_input_file\n"; `wget -q $ftp_site -O $zip_input_file`; } # using unix command wget - download new zip files # rm *.zip file after unzipping foreach $rif (@geneRIF) { my $ftp_site = "ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/" . $rif . ".gz"; my $zip_input_file = "$basic_dir" ."/" . $rif . ".gz"; print "now downloading $rif here: $zip_input_file\n"; `wget -q $ftp_site -O $zip_input_file`; `gunzip $zip_input_file`; `rm -Rf $zip_input_file`; } # using unix command wget - download new zip files # rm *.zip file after unzipping my $ftp_site = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"; my $zip_input_file = $basic_dir ."/" . "taxdump.tar.gz"; print "now downloading taxdump.tar.gz here: $zip_input_file\n"; `wget -q $ftp_site -O $zip_input_file`; `tar -zxf $zip_input_file`; `mv $location_of_script/names.dmp $basic_dir/.`; `rm -f *.dmp gc.prt`; `rm -Rf $zip_input_file`; ####### # # INSERT # ####### # foreach newly downloaded file; # parse each file (ignoring comments), add a primary id, and print out # information (comma separated) to a new *.sql file (named same as $file) # then, using mysqlimport - insert file into the db $line = ''; $primary_id= 0; foreach $file (@tables) { chomp $file; if ($file eq ''){next;} if ($file eq "tax2name") { # Taxonomy is a little different from Entrez Gene # I need to parse out the information that I need from names.dmp $file = "$basic_dir/names.dmp"; open (READ, $file) || die "Can't open $file : $!"; open (TAX, ">$SQL/tax2name") || die "Can't open $SQL/tax2name : $!"; while () { chomp($_); @fields = split(/\t/,$_); if($fields[6] =~ /scientific/i) { $tax_id = $fields[0]; $tax_id =~ s/^s+//; $tax_id =~ s/\s+$//; $name = $fields[2]; $name =~ s/^s+//; $name =~ s/\s+$//; print TAX "$tax_id\t$name\n"; } } `mysqlimport -h $host $db -u $user -p$pw -L -d -v --fields-terminated-by="\t" $SQL/tax2name`; print "inserted new data into: tax2name\n"; next; } else { open (READ, "$basic_dir/$file") || die "Can't open $basic_dir/$file : $!"; open (SQL, ">$SQL/$file") || die "Can't open $SQL/$file $!"; $primary_id =0; while () { chomp($_); if ($_ =~ /^#/) { next; } $primary_id++; @data = split(/\t/,$_); $line = "\'$primary_id\'\t"; for ($i=0; $i< @data; $i++) { $data[$i] =~ s/^\s+//; $data[$i] =~ s/\s+$//; $line .= "\'$data[$i]\'\t"; } $line =~ s/\t$/\n/; print SQL "$line"; } } print "inserting new data into: $file\n"; `mysqlimport -h $host $db -u $user -p$pw -L -d -v --fields-terminated-by="\t" --fields-enclosed-by="'" $SQL/$file`; } close READ; close SQL; close TAX; print "Script has completed. Database should now be populated with data.\n";