Merge refs/heads/master from paulus
[git.git] / tools / mailinfo.c
1 /*
2  * Another stupid program, this one parsing the headers of an
3  * email to figure out authorship and subject
4  */
5 #define _GNU_SOURCE
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <ctype.h>
10 #include <iconv.h>
11
/* Output streams: the commit-message file and the patch file. */
static FILE *cmitmsg;
static FILE *patchfile;

/* Command-line switches: -k sets keep_subject, -u sets metainfo_utf8. */
static int keep_subject = 0;
static int metainfo_utf8 = 0;

/* Buffer holding the line most recently read from stdin. */
static char line[1000];

/* Meta information gathered from the headers. */
static char date[1000];
static char name[1000];
static char email[1000];
static char subject[1000];

/* Content-Transfer-Encoding currently in effect. */
static enum {
	TE_DONTCARE,
	TE_QP,
	TE_BASE64,
} transfer_encoding;

/* Value of the most recent charset= attribute ("" when none). */
static char charset[256];

/* Multipart boundary, stored with its leading "--", and its length. */
static char multipart_boundary[1000];
static int multipart_boundary_len;

/* Number of lines written to the patch file so far. */
static int patch_lines = 0;
30
/*
 * Pick the string to use as the author name.
 *
 * Returns name when it is 3 to 60 bytes long and contains none of
 * '@', '<' or '>'; otherwise returns email.
 */
static char *sanity_check(char *name, char *email)
{
	int len = strlen(name);
	int plausible_length = (3 <= len && len <= 60);

	if (plausible_length && strpbrk(name, "@<>") == NULL)
		return name;
	return email;
}
40
41 static int handle_from(char *line)
42 {
43         char *at = strchr(line, '@');
44         char *dst;
45
46         if (!at)
47                 return 0;
48
49         /*
50          * If we already have one email, don't take any confusing lines
51          */
52         if (*email && strchr(at+1, '@'))
53                 return 0;
54
55         /* Pick up the string around '@', possibly delimited with <>
56          * pair; that is the email part.  White them out while copying.
57          */
58         while (at > line) {
59                 char c = at[-1];
60                 if (isspace(c))
61                         break;
62                 if (c == '<') {
63                         at[-1] = ' ';
64                         break;
65                 }
66                 at--;
67         }
68         dst = email;
69         for (;;) {
70                 unsigned char c = *at;
71                 if (!c || c == '>' || isspace(c)) {
72                         if (c == '>')
73                                 *at = ' ';
74                         break;
75                 }
76                 *at++ = ' ';
77                 *dst++ = c;
78         }
79         *dst++ = 0;
80
81         /* The remainder is name.  It could be "John Doe <john.doe@xz>"
82          * or "john.doe@xz (John Doe)", but we have whited out the
83          * email part, so trim from both ends, possibly removing
84          * the () pair at the end.
85          */
86         at = line + strlen(line);
87         while (at > line) {
88                 unsigned char c = *--at;
89                 if (!isspace(c)) {
90                         at[(c == ')') ? 0 : 1] = 0;
91                         break;
92                 }
93         }
94
95         at = line;
96         for (;;) {
97                 unsigned char c = *at;
98                 if (!c || !isspace(c)) {
99                         if (c == '(')
100                                 at++;
101                         break;
102                 }
103                 at++;
104         }
105         at = sanity_check(at, email);
106         strcpy(name, at);
107         return 1;
108 }
109
110 static int handle_date(char *line)
111 {
112         strcpy(date, line);
113         return 0;
114 }
115
116 static int handle_subject(char *line)
117 {
118         strcpy(subject, line);
119         return 0;
120 }
121
/* NOTE NOTE NOTE.  We do not claim we do full MIME.  We just attempt
 * to have enough heuristics to grok MIME encoded patches often found
 * on our mailing lists.  For example, we do not handle nested
 * multipart messages.
 */
127
/*
 * Extract the value of an attribute such as "charset=" from a header
 * line.
 *
 * name is searched for case-insensitively and must include the '='.
 * A value that starts with '"' runs up to the next '"'; otherwise it
 * runs up to the first ';', space or tab.  The value is copied to attr
 * without any bound, so attr must be large enough for it.
 *
 * Returns 1 when the attribute was found, or 0 (with attr set to the
 * empty string) when it was not.
 */
static int slurp_attr(const char *line, const char *name, char *attr)
{
	const char *value, *terminators;
	const char *hit = strcasestr(line, name);
	size_t n;

	if (hit == NULL) {
		attr[0] = '\0';
		return 0;
	}

	value = hit + strlen(name);
	if (*value == '"') {
		value++;
		terminators = "\"";
	} else {
		terminators = "; \t";
	}

	n = strcspn(value, terminators);
	memcpy(attr, value, n);
	attr[n] = '\0';
	return 1;
}
149
150 static int handle_subcontent_type(char *line)
151 {
152         /* We do not want to mess with boundary.  Note that we do not
153          * handle nested multipart.
154          */
155         slurp_attr(line, "charset=", charset);
156         if (*charset) {
157                 int i, c;
158                 for (i = 0; (c = charset[i]) != 0; i++)
159                         charset[i] = tolower(c);
160         }
161         return 0;
162 }
163
164 static int handle_content_type(char *line)
165 {
166         *multipart_boundary = 0;
167         if (slurp_attr(line, "boundary=", multipart_boundary + 2)) {
168                 memcpy(multipart_boundary, "--", 2);
169                 multipart_boundary_len = strlen(multipart_boundary);
170         }
171         slurp_attr(line, "charset=", charset);
172         return 0;
173 }
174
175 static int handle_content_transfer_encoding(char *line)
176 {
177         if (strcasestr(line, "base64"))
178                 transfer_encoding = TE_BASE64;
179         else if (strcasestr(line, "quoted-printable"))
180                 transfer_encoding = TE_QP;
181         else
182                 transfer_encoding = TE_DONTCARE;
183         return 0;
184 }
185
186 static int is_multipart_boundary(const char *line)
187 {
188         return (!memcmp(line, multipart_boundary, multipart_boundary_len));
189 }
190
/*
 * Strip trailing whitespace from line, in place.
 *
 * Returns the length of what is left.
 */
static int eatspace(char *line)
{
	int len = strlen(line);

	/* cast: isspace() is undefined for negative (non-ASCII) char values */
	while (len > 0 && isspace((unsigned char)line[len - 1]))
		line[--len] = 0;
	return len;
}
198
#define SEEN_FROM 01
#define SEEN_DATE 02
#define SEEN_SUBJECT 04

/*
 * First lines of body can have From:, Date:, and Subject:
 *
 * seen is a bit mask of SEEN_* flags for the headers already taken;
 * each kind is accepted at most once.  A line starting with "[PATCH]"
 * followed by whitespace is taken, whole, as the subject.
 *
 * Returns 1 when line was consumed as an in-body header, 0 otherwise.
 */
static int handle_inbody_header(int *seen, char *line)
{
	/*
	 * strncmp() stops at the end of a short line, and the casts keep
	 * isspace() defined for bytes >= 0x80.
	 */
	if (!strncmp("From:", line, 5) && isspace((unsigned char)line[5])) {
		if (!(*seen & SEEN_FROM) && handle_from(line + 6)) {
			*seen |= SEEN_FROM;
			return 1;
		}
	}
	if (!strncmp("Date:", line, 5) && isspace((unsigned char)line[5])) {
		if (!(*seen & SEEN_DATE)) {
			handle_date(line + 6);
			*seen |= SEEN_DATE;
			return 1;
		}
	}
	if (!strncmp("Subject:", line, 8) && isspace((unsigned char)line[8])) {
		if (!(*seen & SEEN_SUBJECT)) {
			handle_subject(line + 9);
			*seen |= SEEN_SUBJECT;
			return 1;
		}
	}
	if (!strncmp("[PATCH]", line, 7) && isspace((unsigned char)line[7])) {
		if (!(*seen & SEEN_SUBJECT)) {
			handle_subject(line);
			*seen |= SEEN_SUBJECT;
			return 1;
		}
	}
	return 0;
}
235
236 static char *cleanup_subject(char *subject)
237 {
238         if (keep_subject)
239                 return subject;
240         for (;;) {
241                 char *p;
242                 int len, remove;
243                 switch (*subject) {
244                 case 'r': case 'R':
245                         if (!memcmp("e:", subject+1, 2)) {
246                                 subject +=3;
247                                 continue;
248                         }
249                         break;
250                 case ' ': case '\t': case ':':
251                         subject++;
252                         continue;
253
254                 case '[':
255                         p = strchr(subject, ']');
256                         if (!p) {
257                                 subject++;
258                                 continue;
259                         }
260                         len = strlen(p);
261                         remove = p - subject;
262                         if (remove <= len *2) {
263                                 subject = p+1;
264                                 continue;
265                         }       
266                         break;
267                 }
268                 return subject;
269         }
270 }                       
271
/*
 * Collapse every run of whitespace in buf into a single space, in
 * place.  Leading and trailing runs are collapsed too, not removed.
 *
 * Done in one pass with separate read and write positions instead of
 * shifting the tail of the string for every removed character.
 */
static void cleanup_space(char *buf)
{
	char *dst = buf;
	unsigned char c;

	while ((c = *buf) != 0) {
		buf++;
		if (!isspace(c)) {
			*dst++ = c;
			continue;
		}
		*dst++ = ' ';
		/* swallow the rest of the run */
		while (isspace((unsigned char)*buf))
			buf++;
	}
	*dst = 0;
}
288
/* A handler receives the header value: the text after "Name:" and one
 * whitespace character.
 */
typedef int (*header_fn_t)(char *);

/* One entry of a header dispatch table; a NULL name ends the table. */
struct header_def {
	const char *name;	/* header name, without the colon */
	header_fn_t func;	/* called with the header value */
	int namelen;		/* strlen(name), filled in by check_header() */
};

/*
 * Dispatch one header line to the first matching entry of header[].
 *
 * An entry matches when line starts with its name (ignoring case),
 * followed by ':' and one whitespace character.  The namelen fields
 * are filled in on first use, which is detected by header[0].namelen
 * still being 0.
 *
 * len is not used.
 */
static void check_header(char *line, int len, struct header_def *header)
{
	int i;

	(void)len;

	if (header[0].namelen <= 0) {
		for (i = 0; header[i].name; i++)
			header[i].namelen = strlen(header[i].name);
	}
	for (i = 0; header[i].name; i++) {
		int namelen = header[i].namelen;

		/* cast: isspace() is undefined for negative char values */
		if (!strncasecmp(line, header[i].name, namelen) &&
		    line[namelen] == ':' &&
		    isspace((unsigned char)line[namelen + 1])) {
			header[i].func(line + namelen + 2);
			break;
		}
	}
}
313
314 static void check_subheader_line(char *line, int len)
315 {
316         static struct header_def header[] = {
317                 { "Content-Type", handle_subcontent_type },
318                 { "Content-Transfer-Encoding",
319                   handle_content_transfer_encoding },
320                 { NULL },
321         };
322         check_header(line, len, header);
323 }
324 static void check_header_line(char *line, int len)
325 {
326         static struct header_def header[] = {
327                 { "From", handle_from },
328                 { "Date", handle_date },
329                 { "Subject", handle_subject },
330                 { "Content-Type", handle_content_type },
331                 { "Content-Transfer-Encoding",
332                   handle_content_transfer_encoding },
333                 { NULL },
334         };
335         check_header(line, len, header);
336 }
337
/*
 * Read one logical header line from in into line (capacity sz).
 *
 * Trailing whitespace is stripped from every physical line.  When the
 * next character in the stream is a space or a tab, the following
 * physical line is a continuation (2822 header "folding") and is
 * appended right after what has been read so far.
 *
 * Returns the length of the assembled line; 0 means an empty line or
 * end of input.
 */
static int read_one_header_line(char *line, int sz, FILE *in)
{
	int used = 0;

	while (used < sz) {
		int piece, next;

		if (fgets(line + used, sz - used, in) == NULL)
			break;
		piece = eatspace(line + used);
		if (piece == 0)
			break;
		used += piece;

		/* peek at the next character without consuming it */
		next = fgetc(in);
		ungetc(next, in);
		if (next != ' ' && next != '\t')
			break;
	}
	return used;
}
358
/*
 * Value of the hexadecimal digit c, or ~0 (all bits set) when c is not
 * a hexadecimal digit.
 */
static unsigned hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return ~0;
}

/*
 * Decode the quoted-printable text in..ep into ot, NUL-terminating the
 * result.  Decoding stops at ep or at a NUL, whichever comes first.
 * ot may be the same buffer as in: the output never overtakes the
 * input.
 *
 * "=XX" with two hexadecimal digits yields one byte.  A '=' that ends
 * the input or is followed by a newline stops the decoding (the
 * trailing newline is dropped).  Any other '=' is copied as is,
 * instead of producing a garbage byte and reading past the end.
 *
 * Always returns 0.
 */
static int decode_q_segment(char *in, char *ot, char *ep)
{
	int c;

	while (in < ep && (c = *in) != 0) {
		in++;
		if (c != '=') {
			*ot++ = c;
			continue;
		}
		if (in >= ep || !*in || *in == '\n')
			break; /* drop trailing newline */
		if (ep - in >= 2) {
			unsigned hi = hexval((unsigned char)in[0]);
			unsigned lo = hexval((unsigned char)in[1]);

			if (hi < 16 && lo < 16) {
				*ot++ = (hi << 4) | lo;
				in += 2;
				continue;
			}
		}
		/* malformed or truncated escape: keep the '=' itself */
		*ot++ = c;
	}
	*ot = 0;
	return 0;
}
386
/*
 * Decode the base64 text in..ep into ot, NUL-terminating the result.
 * Decoding stops at ep or at a NUL, whichever comes first.  ot may be
 * the same buffer as in.
 *
 * Characters outside the base64 alphabet are skipped.  Padding ('=')
 * is decoded like a zero digit, so it may leave NUL bytes in the
 * output; the data is simply trusted.
 *
 * Always returns 0.
 */
static int decode_b_segment(char *in, char *ot, char *ep)
{
	int phase = 0;	/* position within the current group of four */
	int carry = 0;	/* bits left over for the next output byte */

	for (; in < ep && *in; in++) {
		int ch = *in;
		int v;

		if ('A' <= ch && ch <= 'Z')
			v = ch - 'A';
		else if ('a' <= ch && ch <= 'z')
			v = ch - 'a' + 26;
		else if ('0' <= ch && ch <= '9')
			v = ch - '0' + 52;
		else if (ch == '+')
			v = 62;
		else if (ch == '/')
			v = 63;
		else if (ch == '=')
			v = 0;
		else
			continue; /* garbage */

		if (phase == 0) {
			carry = v << 2;
		} else if (phase == 1) {
			*ot++ = (carry | (v >> 4));
			carry = (v & 15) << 4;
		} else if (phase == 2) {
			*ot++ = (carry | (v >> 2));
			carry = (v & 3) << 6;
		} else {
			*ot++ = (carry | v);
			carry = 0;
		}
		phase = (phase + 1) & 3;
	}
	*ot = 0;
	return 0;
}
433
/*
 * Convert line, in place, from charset to UTF-8.
 *
 * Nothing happens when charset is empty.  When charset is not known to
 * iconv, a message is printed and charset is cleared, so that the
 * conversion is not attempted again.  When the conversion itself
 * fails, line is left as it was.
 *
 * NOTE(review): the result can be longer than the input and is copied
 * back without a bound; line must have room for up to
 * sizeof(outbuf) bytes.
 */
static void convert_to_utf8(char *line, char *charset)
{
	char *in, *out;
	size_t insize, outsize, nrc;
	char outbuf[4096]; /* cheat */
	iconv_t conv;

	if (!*charset)
		return;

	conv = iconv_open("utf-8", charset);
	if (conv == (iconv_t) -1) {
		fprintf(stderr, "cannot convert from %s to utf-8\n",
			charset);
		*charset = 0;
		return;
	}
	in = line;
	insize = strlen(in);
	out = outbuf;
	/* keep one byte back for the terminator written below */
	outsize = sizeof(outbuf) - 1;
	nrc = iconv(conv, &in, &insize, &out, &outsize);
	iconv_close(conv);
	if (nrc == (size_t) -1)
		return;
	*out = 0;
	strcpy(line, outbuf);
}
460
461 static void decode_header_bq(char *it)
462 {
463         char *in, *out, *ep, *cp, *sp;
464         char outbuf[1000];
465
466         in = it;
467         out = outbuf;
468         while ((ep = strstr(in, "=?")) != NULL) {
469                 int sz, encoding;
470                 char charset_q[256], piecebuf[256];
471                 if (in != ep) {
472                         sz = ep - in;
473                         memcpy(out, in, sz);
474                         out += sz;
475                         in += sz;
476                 }
477                 /* E.g.
478                  * ep : "=?iso-2022-jp?B?GyR...?= foo"
479                  * ep : "=?ISO-8859-1?Q?Foo=FCbar?= baz"
480                  */
481                 ep += 2;
482                 cp = strchr(ep, '?');
483                 if (!cp)
484                         return; /* no munging */
485                 for (sp = ep; sp < cp; sp++)
486                         charset_q[sp - ep] = tolower(*sp);
487                 charset_q[cp - ep] = 0;
488                 encoding = cp[1];
489                 if (!encoding || cp[2] != '?')
490                         return; /* no munging */
491                 ep = strstr(cp + 3, "?=");
492                 if (!ep)
493                         return; /* no munging */
494                 switch (tolower(encoding)) {
495                 default:
496                         return; /* no munging */
497                 case 'b':
498                         sz = decode_b_segment(cp + 3, piecebuf, ep);
499                         break;
500                 case 'q':
501                         sz = decode_q_segment(cp + 3, piecebuf, ep);
502                         break;
503                 }
504                 if (sz < 0)
505                         return;
506                 if (metainfo_utf8)
507                         convert_to_utf8(piecebuf, charset_q);
508                 strcpy(out, piecebuf);
509                 out += strlen(out);
510                 in = ep + 2;
511         }
512         strcpy(out, in);
513         strcpy(it, outbuf);
514 }
515
516 static void decode_transfer_encoding(char *line)
517 {
518         char *ep;
519
520         switch (transfer_encoding) {
521         case TE_QP:
522                 ep = line + strlen(line);
523                 decode_q_segment(line, line, ep);
524                 break;
525         case TE_BASE64:
526                 ep = line + strlen(line);
527                 decode_b_segment(line, line, ep);
528                 break;
529         case TE_DONTCARE:
530                 break;
531         }
532 }
533
534 static void handle_info(void)
535 {
536         char *sub;
537         static int done_info = 0;
538
539         if (done_info)
540                 return;
541
542         done_info = 1;
543         sub = cleanup_subject(subject);
544         cleanup_space(name);
545         cleanup_space(date);
546         cleanup_space(email);
547         cleanup_space(sub);
548
549         /* Unwrap inline B and Q encoding, and optionally
550          * normalize the meta information to utf8.
551          */
552         decode_header_bq(name);
553         decode_header_bq(date);
554         decode_header_bq(email);
555         decode_header_bq(sub);
556         printf("Author: %s\nEmail: %s\nSubject: %s\nDate: %s\n\n",
557                name, email, sub, date);
558 }
559
560 /* We are inside message body and have read line[] already.
561  * Spit out the commit log.
562  */
563 static int handle_commit_msg(void)
564 {
565         if (!cmitmsg)
566                 return 0;
567         do {
568                 if (!memcmp("diff -", line, 6) ||
569                     !memcmp("---", line, 3) ||
570                     !memcmp("Index: ", line, 7))
571                         break;
572                 if ((multipart_boundary[0] && is_multipart_boundary(line))) {
573                         /* We come here when the first part had only
574                          * the commit message without any patch.  We
575                          * pretend we have not seen this line yet, and
576                          * go back to the loop.
577                          */
578                         return 1;
579                 }
580
581                 /* Unwrap transfer encoding and optionally
582                  * normalize the log message to UTF-8.
583                  */
584                 decode_transfer_encoding(line);
585                 if (metainfo_utf8)
586                         convert_to_utf8(line, charset);
587                 fputs(line, cmitmsg);
588         } while (fgets(line, sizeof(line), stdin) != NULL);
589         fclose(cmitmsg);
590         cmitmsg = NULL;
591         return 0;
592 }
593
594 /* We have done the commit message and have the first
595  * line of the patch in line[].
596  */
597 static void handle_patch(void)
598 {
599         do {
600                 if (multipart_boundary[0] && is_multipart_boundary(line))
601                         break;
602                 /* Only unwrap transfer encoding but otherwise do not
603                  * do anything.  We do *NOT* want UTF-8 conversion
604                  * here; we are dealing with the user payload.
605                  */
606                 decode_transfer_encoding(line);
607                 fputs(line, patchfile);
608                 patch_lines++;
609         } while (fgets(line, sizeof(line), stdin) != NULL);
610 }
611
612 /* multipart boundary and transfer encoding are set up for us, and we
613  * are at the end of the sub header.  do equivalent of handle_body up
614  * to the next boundary without closing patchfile --- we will expect
615  * that the first part to contain commit message and a patch, and
616  * handle other parts as pure patches.
617  */
618 static int handle_multipart_one_part(void)
619 {
620         int seen = 0;
621         int n = 0;
622         int len;
623
624         while (fgets(line, sizeof(line), stdin) != NULL) {
625         again:
626                 len = eatspace(line);
627                 n++;
628                 if (!len)
629                         continue;
630                 if (is_multipart_boundary(line))
631                         break;
632                 if (0 <= seen && handle_inbody_header(&seen, line))
633                         continue;
634                 seen = -1; /* no more inbody headers */
635                 line[len] = '\n';
636                 handle_info();
637                 if (handle_commit_msg())
638                         goto again;
639                 handle_patch();
640                 break;
641         }
642         if (n == 0)
643                 return -1;
644         return 0;
645 }
646
647 static void handle_multipart_body(void)
648 {
649         int part_num = 0;
650
651         /* Skip up to the first boundary */
652         while (fgets(line, sizeof(line), stdin) != NULL)
653                 if (is_multipart_boundary(line)) {
654                         part_num = 1;
655                         break;
656                 }
657         if (!part_num)
658                 return;
659         /* We are on boundary line.  Start slurping the subhead. */
660         while (1) {
661                 int len = read_one_header_line(line, sizeof(line), stdin);
662                 if (!len) {
663                         if (handle_multipart_one_part() < 0)
664                                 return;
665                 }
666                 else
667                         check_subheader_line(line, len);
668         }
669         fclose(patchfile);
670         if (!patch_lines) {
671                 fprintf(stderr, "No patch found\n");
672                 exit(1);
673         }
674 }
675
676 /* Non multipart message */
677 static void handle_body(void)
678 {
679         int seen = 0;
680
681         while (fgets(line, sizeof(line), stdin) != NULL) {
682                 int len = eatspace(line);
683                 if (!len)
684                         continue;
685                 if (0 <= seen && handle_inbody_header(&seen, line))
686                         continue;
687                 seen = -1; /* no more inbody headers */
688                 line[len] = '\n';
689                 handle_info();
690                 handle_commit_msg();
691                 handle_patch();
692                 break;
693         }
694         fclose(patchfile);
695         if (!patch_lines) {
696                 fprintf(stderr, "No patch found\n");
697                 exit(1);
698         }
699 }
700
static const char mailinfo_usage[] =
	"git-mailinfo [-k] [-u] msg patch <mail >info";

/* Print the usage line to stderr and exit with status 1. */
static void usage(void)
{
	fputs(mailinfo_usage, stderr);
	fputc('\n', stderr);
	exit(1);
}
708
709 int main(int argc, char **argv)
710 {
711         while (1 < argc && argv[1][0] == '-') {
712                 if (!strcmp(argv[1], "-k"))
713                         keep_subject = 1;
714                 else if (!strcmp(argv[1], "-u"))
715                         metainfo_utf8 = 1;
716                 else
717                         usage();
718                 argc--; argv++;
719         }
720
721         if (argc != 3)
722                 usage();
723         cmitmsg = fopen(argv[1], "w");
724         if (!cmitmsg) {
725                 perror(argv[1]);
726                 exit(1);
727         }
728         patchfile = fopen(argv[2], "w");
729         if (!patchfile) {
730                 perror(argv[2]);
731                 exit(1);
732         }
733         while (1) {
734                 int len = read_one_header_line(line, sizeof(line), stdin);
735                 if (!len) {
736                         if (multipart_boundary[0])
737                                 handle_multipart_body();
738                         else
739                                 handle_body();
740                         break;
741                 }
742                 check_header_line(line, len);
743         }
744         return 0;
745 }