3 # This tool is copyright (c) 2005, Martin Langhoff.
4 # It is released under the Gnu Public License, version 2.
6 # The basic idea is to walk the output of tla abrowse,
7 # fetch the changesets and apply them.
12 git-archimport [ -h ] [ -v ] [ -T ] [ -t tempdir ] <archive>/<branch> [ <archive>/<branch> ]
14 Imports a project from one or more Arch repositories. It will follow branches
15 and repositories within the namespaces defined by the <archive/branch>
16 parameters suppplied. If it cannot find the remote branch a merge comes from
17 it will just import it as a regular commit. If it can find it, it will mark it
18 as a merge whenever possible.
20 See man (1) git-archimport for more details.
24 - create tag objects instead of ref tags
25 - audit shell-escaping of filenames
26 - hide our private tags somewhere smarter
27 - find a way to make "cat *patches | patch" safe even when patchfiles are missing newlines
31 Add print in front of the shell commands invoked via backticks.
35 There are several places where Arch and git terminology are intermixed
36 and potentially confused.
38 The notion of a "branch" in git is approximately equivalent to
39 a "archive/category--branch--version" in Arch. Also, it should be noted
40 that the "--branch" portion of "archive/category--branch--version" is really
41 optional in Arch although not many people (nor tools!) seem to know this.
42 This means that "archive/category--version" is also a valid "branch"
45 We always refer to Arch names by their fully qualified variant (which
46 means the "archive" name is prefixed.
48 For people unfamiliar with Arch, an "archive" is the term for "repository",
49 and can contain multiple, unrelated branches.
57 use File::Temp qw(tempfile tempdir);
58 use File::Path qw(mkpath);
59 use File::Basename qw(basename dirname);
60 use String::ShellQuote;
64 use POSIX qw(strftime dup2);
65 use Data::Dumper qw/ Dumper /;
68 $SIG{'PIPE'}="IGNORE";
71 my $git_dir = $ENV{"GIT_DIR"} || ".git";
72 $ENV{"GIT_DIR"} = $git_dir;
73 my $ptag_dir = "$git_dir/archimport/tags";
75 our($opt_h,$opt_v, $opt_T,
80 Usage: ${\basename $0} # fetch/update GIT from Arch
81 [ -h ] [ -v ] [ -T ] [ -t tempdir ]
82 repository/arch-branch [ repository/arch-branch] ...
87 getopts("Thvt:") or usage();
90 @ARGV >= 1 or usage();
91 my @arch_roots = @ARGV;
93 my ($tmpdir, $tmpdirname) = tempdir('git-archimport-XXXXXX', TMPDIR => 1, CLEANUP => 1);
94 my $tmp = $opt_t || 1;
95 $tmp = tempdir('git-archimport-XXXXXX', TMPDIR => 1, CLEANUP => 1);
96 $opt_v && print "+ Using $tmp as temporary directory\n";
98 my @psets = (); # the collection
99 my %psets = (); # the collection, by name
101 my %rptags = (); # my reverse private tags
102 # to map a SHA1 to a commitid
104 foreach my $root (@arch_roots) {
105 my ($arepo, $abranch) = split(m!/!, $root);
106 open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |"
107 or die "Problems with tla abrowse: $!";
109 my %ps = (); # the current one
116 # first record padded w 8 spaces
119 # store the record we just captured
121 my %temp = %ps; # break references
122 push (@psets, \%temp);
123 $psets{$temp{id}} = \%temp;
127 my ($id, $type) = split(m/\s{3}/, $_);
132 if ($type =~ m/^\(simple changeset\)/) {
134 } elsif ($type eq '(initial import)') {
136 } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
140 warn "Unknown type $type";
146 # 10 leading spaces or more
147 # indicate commit metadata
150 if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
152 my ($date, $authoremail) = split(m/\s{2,}/, $_);
154 $ps{date} =~ s/\bGMT$//; # strip off trailign GMT
155 if ($ps{date} =~ m/\b\w+$/) {
156 warn 'Arch dates not in GMT?! - imported dates will be wrong';
159 $authoremail =~ m/^(.+)\s(\S+)$/;
165 } elsif ($lastseen eq 'date') {
166 # the only hint is position
167 # subject is after date
171 } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
173 $lastseen = 'merges';
175 } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
176 push (@{$ps{merges}}, $_);
178 warn 'more metadata after merges!?';
185 my %temp = %ps; # break references
186 push (@psets, \%temp);
187 $psets{ $temp{id} } = \%temp;
191 } # end foreach $root
193 ## Order patches by time
194 @psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
196 #print Dumper \@psets;
199 ## TODO cleanup irrelevant patches
200 ## and put an initial import
203 unless (-d $git_dir) { # initial import
204 if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
205 print "Starting import from $psets[0]{id}\n";
210 die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
212 } else { # progressing an import
214 opendir(DIR, "$git_dir/archimport/tags")
215 || die "can't opendir: $!";
216 while (my $file = readdir(DIR)) {
217 # skip non-interesting-files
218 next unless -f "$ptag_dir/$file";
220 # convert first '--' to '/' from old git-archimport to use
221 # as an archivename/c--b--v private tag
225 print STDERR "converting old tag $oldfile to $file\n";
226 rename("$ptag_dir/$oldfile", "$ptag_dir/$file") or die $!;
228 my $sha = ptag($file);
230 $rptags{$sha} = $file;
236 # extract the Arch repository name (Arch "archive" in Arch-speak)
237 sub extract_reponame {
238 my $fq_cvbr = shift; # archivename/[[[[category]branch]version]revision]
239 return (split(/\//, $fq_cvbr))[0];
242 sub extract_versionname {
244 $name =~ s/--(?:patch|version(?:fix)?|base)-\d+$//;
248 # convert a fully-qualified revision or version to a unique dirname:
249 # normalperson@yhbt.net-05/mpd--uclinux--1--patch-2
250 # becomes: normalperson@yhbt.net-05,mpd--uclinux--1
252 # the git notion of a branch is closer to
253 # archive/category--branch--version than archive/category--branch, so we
254 # use this to convert to git branch names.
255 # Also, keep archive names but replace '/' with ',' since it won't require
256 # subdirectories, and is safer than swapping '--' which could confuse
257 # reverse-mapping when dealing with bastard branches that
258 # are just archive/category--version (no --branch)
260 my $revision = shift;
261 my $name = extract_versionname($revision);
266 *git_branchname = *tree_dirname;
269 foreach my $ps (@psets) {
270 $ps->{branch} = git_branchname($ps->{id});
273 # ensure we have a clean state
275 if (`git diff-files`) {
276 die "Unclean tree when about to process $ps->{id} " .
277 " - did we fail to commit cleanly before?";
282 # skip commits already in repo
284 if (ptag($ps->{id})) {
285 $opt_v && print " * Skipping already imported: $ps->{id}\n";
289 print " * Starting to work on $ps->{id}\n";
292 # create the branch if needed
294 if ($ps->{type} eq 'i' && !$import) {
295 die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
298 unless ($import) { # skip for import
299 if ( -e "$git_dir/refs/heads/$ps->{branch}") {
300 # we know about this branch
301 `git checkout $ps->{branch}`;
303 # new branch! we need to verify a few things
304 die "Branch on a non-tag!" unless $ps->{type} eq 't';
305 my $branchpoint = ptag($ps->{tag});
306 die "Tagging from unknown id unsupported: $ps->{tag}"
309 # find where we are supposed to branch from
310 `git checkout -b $ps->{branch} $branchpoint`;
312 # If we trust Arch with the fact that this is just
313 # a tag, and it does not affect the state of the tree
314 # then we just tag and move on
315 tag($ps->{id}, $branchpoint);
316 ptag($ps->{id}, $branchpoint);
317 print " * Tagged $ps->{id} at $branchpoint\n";
324 # Apply the import/changeset/merge into the working tree
326 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
327 apply_import($ps) or die $!;
329 } elsif ($ps->{type} eq 's') {
334 # prepare update git's index, based on what arch knows
335 # about the pset, resolve parents, etc
339 my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`;
340 die "Error in cat-archive-log: $!" if $?;
342 # parselog will git-add/rm files
343 # and generally prepare things for the commit
344 # NOTE: parselog will shell-quote filenames!
345 my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
346 my $logmessage = "$sum\n$msg";
349 # imports don't give us good info
350 # on added files. Shame on them
351 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
352 `find . -type f -print0 | grep -zv '^./$git_dir' | xargs -0 -l100 git-update-index --add`;
353 `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-index --remove`;
358 my @slice = splice(@$add, 0, 100);
359 my $slice = join(' ', @slice);
360 `git-update-index --add $slice`;
361 die "Error in git-update-index --add: $!" if $?;
365 foreach my $file (@$del) {
366 unlink $file or die "Problems deleting $file : $!";
369 my @slice = splice(@$del, 0, 100);
370 my $slice = join(' ', @slice);
371 `git-update-index --remove $slice`;
372 die "Error in git-update-index --remove: $!" if $?;
375 if (@$ren) { # renamed
377 die "Odd number of entries in rename!?";
381 my $from = pop @$ren;
384 unless (-d dirname($to)) {
385 mkpath(dirname($to)); # will die on err
387 #print "moving $from $to";
389 die "Error renaming $from $to : $!" if $?;
390 `git-update-index --remove $from`;
391 die "Error in git-update-index --remove: $!" if $?;
392 `git-update-index --add $to`;
393 die "Error in git-update-index --add: $!" if $?;
397 if (@$mod) { # must be _after_ renames
399 my @slice = splice(@$mod, 0, 100);
400 my $slice = join(' ', @slice);
401 `git-update-index $slice`;
402 die "Error in git-update-index: $!" if $?;
406 # warn "errors when running git-update-index! $!";
407 $tree = `git-write-tree`;
408 die "cannot write tree $!" if $?;
416 if ( -e "$git_dir/refs/heads/$ps->{branch}") {
417 if (open HEAD, "<$git_dir/refs/heads/$ps->{branch}") {
423 if ($ps->{type} eq 's') {
424 warn "Could not find the right head for the branch $ps->{branch}";
430 push @par, find_parents($ps);
432 my $par = join (' ', @par);
435 # Commit, tag and clean state
438 $ENV{GIT_AUTHOR_NAME} = $ps->{author};
439 $ENV{GIT_AUTHOR_EMAIL} = $ps->{email};
440 $ENV{GIT_AUTHOR_DATE} = $ps->{date};
441 $ENV{GIT_COMMITTER_NAME} = $ps->{author};
442 $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
443 $ENV{GIT_COMMITTER_DATE} = $ps->{date};
445 my ($pid, $commit_rh, $commit_wh);
446 $commit_rh = 'commit_rh';
447 $commit_wh = 'commit_wh';
449 $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
451 print WRITER $logmessage; # write
453 my $commitid = <READER>; # read
456 waitpid $pid,0; # close;
458 if (length $commitid != 40) {
459 die "Something went wrong with the commit! $! $commitid";
464 open HEAD, ">$git_dir/refs/heads/$ps->{branch}";
465 print HEAD $commitid;
467 system('git-update-ref', 'HEAD', "$ps->{branch}");
470 ptag($ps->{id}, $commitid); # private tag
471 if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
472 tag($ps->{id}, $commitid);
474 print " * Committed $ps->{id}\n";
475 print " + tree $tree\n";
476 print " + commit $commitid\n";
477 $opt_v && print " + commit date is $ps->{date} \n";
478 $opt_v && print " + parents: $par \n";
483 my $bname = git_branchname($ps->{id});
487 `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
488 die "Cannot get import: $!" if $?;
489 `rsync -v --archive --delete --exclude '$git_dir' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
490 die "Cannot rsync import:$!" if $?;
492 `rm -fr $tmp/import`;
493 die "Cannot remove tempdir: $!" if $?;
505 `tla get-changeset -A $ps->{repo} $ps->{id} $tmp/changeset`;
506 die "Cannot get changeset: $!" if $?;
509 if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
510 # this can be sped up considerably by doing
511 # (find | xargs cat) | patch
512 # but that cna get mucked up by patches
513 # with missing trailing newlines or the standard
514 # 'missing newline' flag in the patch - possibly
515 # produced with an old/buggy diff.
516 # slow and safe, we invoke patch once per patchfile
517 `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
518 die "Problem applying patches! $!" if $?;
521 # apply changed binary files
522 if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
523 foreach my $mod (@modified) {
526 $orig =~ s/\.modified$//; # lazy
527 $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
528 #print "rsync -p '$mod' '$orig'";
529 `rsync -p $mod ./$orig`;
530 die "Problem applying binary changes! $!" if $?;
535 `rsync --archive --exclude '$git_dir' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
537 # deleted files are hinted from the commitlog processing
539 `rm -fr $tmp/changeset`;
544 # A log entry looks like
545 # Revision: moodle-org--moodle--1.3.3--patch-15
546 # Archive: arch-eduforge@catalyst.net.nz--2004
547 # Creator: Penny Leach <penny@catalyst.net.nz>
548 # Date: Wed May 25 14:15:34 NZST 2005
549 # Standard-date: 2005-05-25 02:15:34 GMT
550 # New-files: lang/de/.arch-ids/block_glossary_random.php.id
551 # lang/de/.arch-ids/block_html.php.id
552 # New-directories: lang/de/help/questionnaire
553 # lang/de/help/questionnaire/.arch-ids
554 # Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
555 # db_sears.sql db/db_sears.sql
556 # Removed-files: lang/be/docs/.arch-ids/release.html.id
557 # lang/be/docs/.arch-ids/releaseold.html.id
558 # Modified-files: admin/cron.php admin/delete.php
559 # admin/editor.html backup/lib.php backup/restore.php
560 # New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
561 # Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
564 # Updating yadda tadda tadda madda
569 my (@add, @del, @mod, @ren, @kw, $sum, $msg );
571 if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
573 @add = split(m/\s+/s, $files);
576 if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
578 @del = split(m/\s+/s, $files);
581 if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
583 @mod = split(m/\s+/s, $files);
586 if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
588 @ren = split(m/\s+/s, $files);
592 if ($log =~ m/^Summary:(.+?)$/m ) {
599 if ($log =~ m/\n\n(.+)$/s) {
607 foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
609 while (my $t = pop @$ref) {
610 next unless length ($t);
611 next if $t =~ m!\{arch\}/!;
612 next if $t =~ m!\.arch-ids/!;
613 next if $t =~ m!\.arch-inventory$!;
614 # tla cat-archive-log will give us filenames with spaces as file\(sp)name - why?
615 # we can assume that any filename with \ indicates some pika escaping that we want to get rid of.
617 $t = `tla escape --unescaped '$t'`;
619 push (@tmp, shell_quote($t));
624 #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren];
625 return ($sum, $msg, \@add, \@del, \@mod, \@ren);
630 my ($tag, $commit) = @_;
632 # don't use subdirs for tags yet, it could screw up other porcelains
636 open(C,">","$git_dir/refs/tags/$tag")
637 or die "Cannot create tag $tag: $!\n";
639 or die "Cannot write tag $tag: $!\n";
641 or die "Cannot write tag $tag: $!\n";
642 print " * Created tag '$tag' on '$commit'\n" if $opt_v;
644 open(C,"<","$git_dir/refs/tags/$tag")
645 or die "Cannot read tag $tag: $!\n";
648 die "Error reading tag $tag: $!\n" unless length $commit == 40;
650 or die "Cannot read tag $tag: $!\n";
655 # write/read a private tag
656 # reads fail softly if the tag isn't there
658 my ($tag, $commit) = @_;
660 # don't use subdirs for tags yet, it could screw up other porcelains
663 my $tag_file = "$ptag_dir/$tag";
664 my $tag_branch_dir = dirname($tag_file);
665 mkpath($tag_branch_dir) unless (-d $tag_branch_dir);
667 if ($commit) { # write
668 open(C,">",$tag_file)
669 or die "Cannot create tag $tag: $!\n";
671 or die "Cannot write tag $tag: $!\n";
673 or die "Cannot write tag $tag: $!\n";
674 $rptags{$commit} = $tag
675 unless $tag =~ m/--base-0$/;
677 # if the tag isn't there, return 0
678 unless ( -s $tag_file) {
681 open(C,"<",$tag_file)
682 or die "Cannot read tag $tag: $!\n";
685 die "Error reading tag $tag: $!\n" unless length $commit == 40;
687 or die "Cannot read tag $tag: $!\n";
688 unless (defined $rptags{$commit}) {
689 $rptags{$commit} = $tag;
697 # Identify what branches are merging into me
698 # and whether we are fully merged
699 # git-merge-base <headsha> <headsha> should tell
700 # me what the base of the merge should be
704 my %branches; # holds an arrayref per branch
705 # the arrayref contains a list of
706 # merged patches between the base
707 # of the merge and the current head
709 my @parents; # parents found for this commit
711 # simple loop to split the merges
713 foreach my $merge (@{$ps->{merges}}) {
714 my $branch = git_branchname($merge);
715 unless (defined $branches{$branch} ){
716 $branches{$branch} = [];
718 push @{$branches{$branch}}, $merge;
722 # foreach branch find a merge base and walk it to the
723 # head where we are, collecting the merged patchsets that
724 # Arch has recorded. Keep that in @have
725 # Compare that with the commits on the other branch
726 # between merge-base and the tip of the branch (@need)
727 # and see if we have a series of consecutive patches
728 # starting from the merge base. The tip of the series
729 # of consecutive patches merged is our new parent for
732 foreach my $branch (keys %branches) {
734 # check that we actually know about the branch
735 next unless -e "$git_dir/refs/heads/$branch";
737 my $mergebase = `git-merge-base $branch $ps->{branch}`;
739 # Don't die here, Arch supports one-way cherry-picking
740 # between branches with no common base (or any relationship
742 warn "Cannot find merge base for $branch and $ps->{branch}";
747 # now walk up to the mergepoint collecting what patches we have
748 my $branchtip = git_rev_parse($ps->{branch});
749 my @ancestors = `git-rev-list --merge-order $branchtip ^$mergebase`;
750 my %have; # collected merges this branch has
751 foreach my $merge (@{$ps->{merges}}) {
755 foreach my $par (@ancestors) {
756 $par = commitid2pset($par);
757 if (defined $par->{merges}) {
758 foreach my $merge (@{$par->{merges}}) {
759 $ancestorshave{$merge}=1;
763 # print "++++ Merges in $ps->{id} are....\n";
764 # my @have = sort keys %have; print Dumper(\@have);
766 # merge what we have with what ancestors have
767 %have = (%have, %ancestorshave);
769 # see what the remote branch has - these are the merges we
770 # will want to have in a consecutive series from the mergebase
771 my $otherbranchtip = git_rev_parse($branch);
772 my @needraw = `git-rev-list --merge-order $otherbranchtip ^$mergebase`;
774 foreach my $needps (@needraw) { # get the psets
775 $needps = commitid2pset($needps);
776 # git-rev-list will also
777 # list commits merged in via earlier
778 # merges. we are only interested in commits
779 # from the branch we're looking at
780 if ($branch eq $needps->{branch}) {
781 push @need, $needps->{id};
785 # print "++++ Merges from $branch we want are....\n";
786 # print Dumper(\@need);
789 while (my $needed_commit = pop @need) {
790 if ($have{$needed_commit}) {
791 $newparent = $needed_commit;
793 last; # break out of the while
797 push @parents, $newparent;
801 } # end foreach branch
803 # prune redundant parents
805 foreach my $p (@parents) {
808 foreach my $p (@parents) {
809 next unless exists $psets{$p}{merges};
810 next unless ref $psets{$p}{merges};
811 my @merges = @{$psets{$p}{merges}};
812 foreach my $merge (@merges) {
813 if ($parents{$merge}) {
814 delete $parents{$merge};
818 @parents = keys %parents;
819 @parents = map { " -p " . ptag($_) } @parents;
825 my $val = `git-rev-parse $name`;
826 die "Error: git-rev-parse $name" if $?;
831 # resolve a SHA1 to a known patchset
833 my $commitid = shift;
835 my $name = $rptags{$commitid}
836 || die "Cannot find reverse tag mapping for $commitid";
838 my $ps = $psets{$name}
839 || (print Dumper(sort keys %psets)) && die "Cannot find patchset for $name";