#!/usr/bin/perl -w # TODO: I'd like to be able to tell it to get some extra files, by name. # I'm thinking Contents files. It would be really nice if it could pull # a whole directory -- think project/trace, or disks-i386... # TODO: It would probably be cleaner and easier to learn if it took # apt-style lines to tell where to mirror from and what portions to use. =head1 NAME debmirror - Debian partial mirror script, with ftp and package pool support =head1 SYNOPSIS debmirror [options] mirrordir =head1 DESCRIPTION This program downloads and maintains a partial local Debian mirror. It can mirror any combination of architectures, distributions, and sections. Files are transferred by ftp, and package pools are fully supported. It also does locking and updates trace files. To support package pools, this program mirrors in three steps. =over 4 =item 1. download Packages and Sources files First it downloads all Packages and Sources files for the subset of Debian it was instructed to get. =item 2. clean up unknown files Any files and directories on the local mirror that are not in the list are removed. =item 3. download everything else The Packages and Sources files are scanned, to build up a list of all the files they refer to. A few other miscellaneous files are added to the list. Then the program makes sure that each file in the list is present on the local mirror and is up-to-date, using file size (and optionally md5sum) checks. Any necessary files are downloaded. =back =cut sub usage { warn join(" ", @_)."\n" if @_; warn < \$debug, 'debug-ftp' => \$debugftp, 'progress|p:n' => \$progress, 'verbose' => \$progress, 'source!' => \$do_source, 'md5sums|m' => \$check_md5sums, 'passive!' => \$passive, 'host|h=s' => \$host, 'user|u=s' => \$user, 'root|r=s' => \$remoteroot, 'dist|d=s' => \@dists, 'section|s=s' => \@sections, 'arch|a=s' => \@arches, 'adddir=s' => \@extra_dirs, 'cleanup!' => \$cleanup, 'ignore=s' => \@ignores, 'exclude=s' => \@excludes, 'include=s' => \@includes, 'skippackages' => \$skippackages, 'getcontents!' => \$getcontents, 'method|e=s' => \$download_method, 'timeout|t=s' => \$timeout, 'help' => \$help, ) or usage; usage if $help; if (defined($progress) && ($progress == 0)) { $progress = 1024; } $| = 1 if ($progress); # This parameter is so important that it is the only required parameter. my $mirrordir=shift or usage("mirrordir not specified"); # Post-process arrays. Allow commas to seperate values the user entered. # If the user entered nothing, provide defaults. @dists=split(/,/,join(',',@dists)); @dists=qw(unstable) unless @dists; @sections=split(/,/,join(',',@sections)); @sections=qw(main contrib non-free) unless @sections; @arches=split(/,/,join(',',@arches)); @arches=qw(i386) unless @arches; # Display configuration. debug("Mirroring to $mirrordir from $download_method://$user:$host/$remoteroot/"); debug("Arches: ".join(",", @arches)); debug("Dists: ".join(",", @dists)); debug("Sections: ".join(",", @sections)); debug("Including source.") if $do_source; debug("Passive mode on.") if $passive; debug("Checking md5sums.") if $check_md5sums; debug("Will NOT clean up.") unless $cleanup; debug("FTP Hash mark every $progress bytes.") if defined($progress); my $md5; if ($check_md5sums) { eval q{use Digest::MD5; $md5=Digest::MD5->new;}; $md5=Digest::MD5->new; } # Set up mirror directory. make_dir($mirrordir); chdir($mirrordir) or die "chdir $mirrordir: $!"; # Handle the lock file. This is the same method used by official # Debian push mirrors. my $hostname=`hostname -f 2>/dev/null || hostname`; chomp $hostname; my $lockfile="Archive-Update-in-Progress-$hostname"; $files{$lockfile}=0; my $lockmgr = LockFile::Simple->make(-format => "%f/$lockfile", -max => 4300, -delay => 10, -nfs => 1, -warn => 0); $SIG{INT}=sub { $lockmgr->unlock("$mirrordir"); exit 1; }; $SIG{TERM}=sub { $lockmgr->unlock("$mirrordir"); exit 1; }; END { $lockmgr->unlock("$mirrordir") }; if (!$lockmgr->trylock("$mirrordir")) { warn "$mirrordir currently locked\n"; $lockmgr->lock("$mirrordir") or die "$lockfile exists; aborting\n"; } # Register the trace file. my $tracefile="project/trace/$hostname"; $files{$tracefile}=0; my $ftp; my %opts = (Debug => $debugftp, Passive => $passive, Timeout => $timeout); INIT: { $_ = $download_method; /^ftp$/ && do { # Start up ftp. ftp_open(); last INIT; }; /^rsync$/ && do { $remoteroot = "$host:$remoteroot"; if (! ($user eq 'anonymous')) { $remoteroot = "$user\@$remoteroot"; } last INIT; }; usage("unknown download method: $_"); } # Get Packages and Sources files and other miscellany. my (@package_files, @source_files); foreach my $dist (@dists) { foreach my $section (@sections) { foreach my $arch (@arches) { get_packages("dists/$dist/$section/binary-$arch"); } get_sources("dists/$dist/$section/source"); } if ($getcontents) { foreach my $arch (@arches) { next if $arch=~/source/; remote_get(make_name("dists/$dist/Contents-$arch.gz")); $files{"dists/$dist/Contents-$arch.gz"}=0; remote_get(make_name("dists/$dist/Release")); $files{"dists/$dist/Release"}=0; } } } foreach (@extra_dirs) { get_packages($_); get_sources($_); } # Sanity check. I once nuked a mirror because of this.. if (@arches && ! @package_files) { die "Failed to download any Packages files!\n"; } if ($do_source && ! @source_files) { die "Failed to download any Sources files!\n"; } # Parse Packages and Sources files and add to the file list everything therein. { local $/="\n\n"; my ($filename, $size, $md5sum, $directory, $exclude, $include, $architecture); my %arches = map { $_ => 1 } (@arches, "all"); $exclude = "(".join("|", @excludes).")" if @excludes; $include = "(".join("|", @includes).")" if @includes; foreach my $file (@package_files) { my $gunzf = gzopen($file, "rb") or die "$file: $!"; my $line; $_ = ""; while ($gunzf->gzreadline($line) > 0) { $_ .= $line; $line eq "\n" or next; ($filename)=m/^Filename:\s+(.*)/im; ($architecture)=m/^Architecture:\s+(.*)/im; next if (!$arches{$architecture}); if(!(defined($include) && ($filename=~/$include/o))) { next if (defined($exclude) && $filename=~/$exclude/o); } ($size)=m/^Size:\s+(\d+)/im; ($md5sum)=m/^MD5sum:\s+([A-Za-z0-9]+)/im; $files{$filename}=check_file($filename, $size, $md5sum); $_ = ""; } $gunzf->gzclose(); } foreach my $file (@source_files) { my $gunzf = gzopen($file, "rb") or die "$file: $!"; my $line; $_ = ""; while ($gunzf->gzreadline($line) > 0) { $_ .= $line; $line eq "\n" or next; ($directory) = m/^Directory:\s+(.*)/im; while (m/^ ([A-Za-z0-9]{32} .*)/mg) { ($md5sum, $size, $filename)=split(' ', $1, 3); $filename="$directory/$filename"; if(!(defined($include) && ($filename=~/$include/o))) { next if (defined($exclude) && $filename=~/$exclude/o); } $files{$filename}=check_file($filename, $size, $md5sum); } $_ = ""; } $gunzf->gzclose(); } } # Download all files that we need to get. DOWNLOAD: { my @result = sort grep { $files{$_} } keys %files; foreach my $file (@result) { my ($dirname) = $file =~ m:(.*)/:; make_dir($dirname); } $_ = $download_method; # Ftp method /^ftp$/ && do { foreach my $file (@result) { ftp_get($file); } last DOWNLOAD; }; # Rsync method /^rsync$/ && do { my $opt = $progress ? " --progress" : "" . $debug ? " -v" : ""; my ($fh, $rsynctempfile) = tempfile(); print STDERR "filename -- $rsynctempfile\n"; local END { unlink $rsynctempfile; }; { local $, = "\n"; print $fh @result; } close $fh; system ("rsync -az --timeout=$timeout$opt $remoteroot --include-from=$rsynctempfile --exclude='*' $mirrordir"); last DOWNLOAD; }; } # delete unknown files if ($cleanup || $debug) { my $ignore; $ignore = "(".join("|", @ignores).")" if @ignores; # Remove all files in the mirror that we don't know about foreach my $file (`find . -type f`) { chomp $file; $file=~s:^\./::; unless (exists $files{$file} or (defined($ignore) && $file=~/$ignore/o)) { if ($cleanup) { debug("deleting $file"); unlink $file or die "unlink $file: $!"; } else { debug("want to delete $file"); } } } # Remove all empty directories. Not done as part of main cleanup # to prevent race problems with pool download code, which # makes directories.. Sort so they are removable in bottom-up # order. if ($cleanup) { system("find . -type d ! -name . ! -name .. | sort -r | xargs rmdir 2>/dev/null"); } } # Finish up. Write out trace file. if ($download_method eq 'ftp') { ftp_close(); } run_rename(); make_dir("project/trace"); open OUT, ">$tracefile" or die "$tracefile: $!"; print OUT `date -u`; close OUT; exit; # Pass this function a filename, a file size (bytes), and a md5sum (hex). # It will return true if the file needs to be downloaded. sub check_file ($$$) { my ($file, $size, $md5sum) = @_; if (-f $file and $size == -s _) { if ($check_md5sums) { open HANDLE, $file or die "$file $!"; $md5->addfile(*HANDLE); if (!defined($md5sum)) { debug("$file: no md5sum in Packages"); return 0; } elsif ($md5sum eq $md5->hexdigest) { return 0; } } else { # Assume it is ok, w/o md5 check. return 0; } # issue error and unlink so ftp_get will refetch. warn "$file: bad checksum"; unlink $file; } return 1; } sub ftp_open () { $ftp = Net::FTP->new($host, %opts) or die "$@\n"; $ftp->login($user) or die "login failed"; # anonymous $ftp->binary or die "could not set binary mode"; $ftp->cwd($remoteroot) or die "cwd to $remoteroot failed"; $ftp->hash(\*STDOUT, $progress) if ($progress); } sub ftp_close () { $ftp->quit; } sub remote_get (@) { METHOD: { $_ = $download_method; /^ftp$/ && do { return ftp_get(@_); }; /^rsync$/ && do { return rsync_get($_[0]); }; } } # Get a file via ftp, first displaying its filename if progress is on. # I should just be able to subclass Net::Ftp and override the get method, # but it's late. # Taught ftp_get how to resume a transfer using file timestamp. About the # only window for a problem is if a remote FTP server changes the contents # of a file without changing the timestamp. If there is a buggy ftp # transfer --md5sums should clean it up. # Taught ftp_get how to fetch to an alternative filename if specified. sub ftp_get_file (@) { my ($file, $alt_name) = (@_); my $ret = undef; my ($l_size, $l_time) = (stat $file)[7,9]; # get local size, time my ($r_time) = $ftp->mdtm($file); # get remote time # get alt_name size, time my ($a_size, $a_time); if ($alt_name) { ($a_size, $a_time) = (stat "$alt_name")[7,9]; } # If remote host supplies time, then we store that locally if (defined($r_time)) { if (defined($a_time) && ($a_time == $r_time)) { debug("$file resuming $alt_name"); $l_size = $a_size; } elsif (defined($l_time) && ($l_time == $r_time)) { if (defined($alt_name)) { debug("$file copy to $alt_name"); !system("cp -p $file $alt_name") or warn "$file copy failed: $!"; } } else { # remote doesn't match l_time or a_time, $l_size = 0; # pull from beginning } $alt_name = $file if (!defined($alt_name)); if ($debug) { # get remote size my ($r_size) = $ftp->size($file); $r_size = defined($r_size) ? $r_size : "unknown"; # print notice about transfer resume debug("$file resume from $l_size of $r_size") if ($l_size != $r_size); } # install local signal handlers, so that r_time is saved. my $sig; $sig = $SIG{INT}; local $SIG{INT} = sub { utime($r_time, $r_time, $alt_name); &{$sig}(); }; $sig = $SIG{TERM}; local $SIG{TERM} = sub { utime($r_time, $r_time, $alt_name); &{$sig}(); }; print "$file : " if $progress; # print progress $ret = $ftp->get($file, $alt_name, $l_size); utime($r_time, $r_time, $alt_name); } else { $alt_name = $file if (!defined($alt_name)); print "$file : " if $progress; # print progress $ret = $ftp->get($file, $alt_name); } if (!$ret) { warn "$file get failed\n"; } return $ret; } { # these are helper functions to handle reconnect on error my ($done_transfer, $toggled_passive); sub ftp_reconnect () { ftp_close(); # automatically turn on passive mode if no transfers have # been done and it's not on already, to fix NAT'ed hosts if (!defined($done_transfer) and !defined($passive)) { debug("attemping passive option\n"); $opts{Passive} = 1; $toggled_passive = 1; } ftp_open(); } sub ftp_get (@) { my $ret; if (!($ret = ftp_get_file(@_))) { warn "$_[0] first fetch failed, will retry\n"; ftp_reconnect(); if (!($ret = ftp_get_file(@_))) { warn "$_[0] second fetch failed\n"; } } # We could change the following warn into a die and be # annoying, but I think the user will turn on --passive to # quiet this message on their own. warn "You must use the --passive option.\n" if ($toggled_passive && !defined($done_transfer)); # record that a transfer has been done so passive toggle # does not happen. $done_transfer = 1 if ($ret); return $ret; } } { my %rename; # make a new name for some file, return (old, new) name sub make_name($) { my $old = shift; my $new = $old; $new =~ s:(.*)/(.*):$1/.$2.new:; $files{$new}=0; $rename{$old} = $new; return $old, $new; } sub get_name($) { my $old = shift; return $rename{$old}; } sub run_rename() { foreach(keys %rename) { debug("rename $rename{$_} to $_"); rename $rename{$_}, $_; } } } sub rsync_get { my $file=shift; my $opt=""; (my $dirname) = $file =~ m:(.*/):; my @dir= split(/\//, $dirname); for (0..$#dir) { $opt = "$opt --include=" . join('/', @dir[0..$_]) . "/"; } $opt = "$opt --progress" if $progress; $opt = "$opt -v" if $debug; print "$file: " if $progress; system ("rsync -az --timeout=$timeout $opt $remoteroot --include=$file --exclude='*' $mirrordir"); return 1 if $? == 0; } # Get Packages file in the passed subdirectory. sub get_packages { my $subdir=shift; make_dir($subdir); if ($skippackages) { push @package_files, "$subdir/Packages.gz"; } else { remote_get(make_name("$subdir/Packages.gz")) and push @package_files, get_name("$subdir/Packages.gz"); remote_get(make_name("$subdir/Release")); # optional } $files{"$subdir/Packages.gz"}=0; $files{"$subdir/Release"}=0; } # Get Sources file sub get_sources { my $subdir=shift; if ($do_source) { make_dir($subdir); if ($skippackages) { push @source_files, "$subdir/Sources.gz"; } else { remote_get(make_name("$subdir/Sources.gz")) and push @source_files, get_name("$subdir/Sources.gz"); } $files{"$subdir/Sources.gz"}=0; } } # Make a directory including all needed parents. { my %seen; sub make_dir { my $dir=shift; my @parts=split('/', $dir); my $current=''; foreach my $part (@parts) { $current.="$part/"; if (! $seen{$current}) { mkdir ($current, 0755); $seen{$current}=1; } } } } sub debug { print STDERR $0.': '.join(' ', @_)."\n" if $debug; } =head1 COPYRIGHT This program is copyright 2001 by Joey Hess , under the terms of the GNU GPL. The author disclaims any responsibility for any mangling of your system, unexpected bandwidth usage bills, meltdown of the Debian mirror network, etc, that this script may cause. See NO WARRANTY section of GPL. =head1 AUTHOR Joey Hess =head1 MOTTO Waste bandwith -- put a partial mirror on your laptop today! =cut