mailinfo: barf and exit upon nested multipart.
[git.git] / tools / mailinfo.c
1 /*
2  * Another stupid program, this one parsing the headers of an
3  * email to figure out authorship and subject
4  */
5 #define _GNU_SOURCE
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <ctype.h>
10 #include <iconv.h>
11
/* Output streams, opened by main(): commit log message and patch text. */
static FILE *cmitmsg;
static FILE *patchfile;

/* Command line switches set by main(): -k and -u respectively. */
static int keep_subject;
static int metainfo_utf8;

/* Current input line and the header fields collected so far. */
static char line[1000];
static char date[1000];
static char name[1000];
static char email[1000];
static char subject[1000];

/* Content-Transfer-Encoding and charset currently in effect. */
static enum {
	TE_DONTCARE, TE_QP, TE_BASE64
} transfer_encoding;
static char charset[256];

/* Multipart boundary, stored with its leading "--", and its length. */
static char multipart_boundary[1000];
static int multipart_boundary_len;

/* Number of lines written to patchfile so far. */
static int patch_lines;
30
/*
 * Pick what to use as the author name: the candidate name itself when
 * it is between 3 and 60 characters long and contains none of '@',
 * '<' or '>'; otherwise fall back to the email address.
 */
static char *sanity_check(char *name, char *email)
{
	size_t name_len = strlen(name);

	if (name_len < 3 || name_len > 60)
		return email;
	if (strpbrk(name, "@<>") != NULL)
		return email;
	return name;
}
40
41 static int handle_from(char *line)
42 {
43         char *at = strchr(line, '@');
44         char *dst;
45
46         if (!at)
47                 return 0;
48
49         /*
50          * If we already have one email, don't take any confusing lines
51          */
52         if (*email && strchr(at+1, '@'))
53                 return 0;
54
55         /* Pick up the string around '@', possibly delimited with <>
56          * pair; that is the email part.  White them out while copying.
57          */
58         while (at > line) {
59                 char c = at[-1];
60                 if (isspace(c))
61                         break;
62                 if (c == '<') {
63                         at[-1] = ' ';
64                         break;
65                 }
66                 at--;
67         }
68         dst = email;
69         for (;;) {
70                 unsigned char c = *at;
71                 if (!c || c == '>' || isspace(c)) {
72                         if (c == '>')
73                                 *at = ' ';
74                         break;
75                 }
76                 *at++ = ' ';
77                 *dst++ = c;
78         }
79         *dst++ = 0;
80
81         /* The remainder is name.  It could be "John Doe <john.doe@xz>"
82          * or "john.doe@xz (John Doe)", but we have whited out the
83          * email part, so trim from both ends, possibly removing
84          * the () pair at the end.
85          */
86         at = line + strlen(line);
87         while (at > line) {
88                 unsigned char c = *--at;
89                 if (!isspace(c)) {
90                         at[(c == ')') ? 0 : 1] = 0;
91                         break;
92                 }
93         }
94
95         at = line;
96         for (;;) {
97                 unsigned char c = *at;
98                 if (!c || !isspace(c)) {
99                         if (c == '(')
100                                 at++;
101                         break;
102                 }
103                 at++;
104         }
105         at = sanity_check(at, email);
106         strcpy(name, at);
107         return 1;
108 }
109
110 static int handle_date(char *line)
111 {
112         strcpy(date, line);
113         return 0;
114 }
115
116 static int handle_subject(char *line)
117 {
118         strcpy(subject, line);
119         return 0;
120 }
121
122 /* NOTE NOTE NOTE.  We do not claim we do full MIME.  We just attempt
123  * to have enough heuristics to grok MIME encoded patches often found
124  * on our mailing lists.  For example, we do not even treat header lines
125  * case insensitively.
126  */
127
/*
 * Find attribute `name` (e.g. "charset=") in `line`, ignoring case,
 * and copy its value into `attr`.  A value that starts with a double
 * quote runs up to the next double quote; otherwise it runs up to the
 * next ';', space or tab.
 *
 * Returns 1 when the attribute was found, 0 (with `attr` set to the
 * empty string) when it was not.  No bound is applied to the copy;
 * `attr` must be large enough for the value.
 */
static int slurp_attr(const char *line, const char *name, char *attr)
{
	const char *value = strcasestr(line, name);
	const char *terminators = "; \t";
	size_t value_len;

	if (value == NULL) {
		attr[0] = '\0';
		return 0;
	}

	value += strlen(name);
	if (*value == '"') {
		value++;
		terminators = "\"";
	}

	value_len = strcspn(value, terminators);
	memcpy(attr, value, value_len);
	attr[value_len] = '\0';
	return 1;
}
149
150 static int handle_subcontent_type(char *line)
151 {
152         /* We do not want to mess with boundary.  Note that we do not
153          * handle nested multipart.
154          */
155         if (strcasestr(line, "boundary=")) {
156                 fprintf(stderr, "Not handling nested multipart message.\n");
157                 exit(1);
158         }
159         slurp_attr(line, "charset=", charset);
160         if (*charset) {
161                 int i, c;
162                 for (i = 0; (c = charset[i]) != 0; i++)
163                         charset[i] = tolower(c);
164         }
165         return 0;
166 }
167
168 static int handle_content_type(char *line)
169 {
170         *multipart_boundary = 0;
171         if (slurp_attr(line, "boundary=", multipart_boundary + 2)) {
172                 memcpy(multipart_boundary, "--", 2);
173                 multipart_boundary_len = strlen(multipart_boundary);
174         }
175         slurp_attr(line, "charset=", charset);
176         return 0;
177 }
178
179 static int handle_content_transfer_encoding(char *line)
180 {
181         if (strcasestr(line, "base64"))
182                 transfer_encoding = TE_BASE64;
183         else if (strcasestr(line, "quoted-printable"))
184                 transfer_encoding = TE_QP;
185         else
186                 transfer_encoding = TE_DONTCARE;
187         return 0;
188 }
189
190 static int is_multipart_boundary(const char *line)
191 {
192         return (!memcmp(line, multipart_boundary, multipart_boundary_len));
193 }
194
/*
 * Strip trailing whitespace from `line` in place by overwriting it
 * with NUL bytes.  Returns the length of what is left.
 */
static int eatspace(char *line)
{
	int len = strlen(line);

	/* isspace() is undefined for negative values, so bytes with the
	 * high bit set must be passed as unsigned char.
	 */
	while (len > 0 && isspace((unsigned char)line[len - 1]))
		line[--len] = 0;
	return len;
}
202
/* Bits recorded in *seen by handle_inbody_header(). */
#define SEEN_FROM 01
#define SEEN_DATE 02
#define SEEN_SUBJECT 04

/*
 * First lines of body can have From:, Date:, and Subject:
 *
 * Each kind is accepted at most once; *seen records which ones have
 * been taken.  A line starting with "[PATCH]" is taken, whole, as the
 * subject.  Returns 1 when the line was consumed as an in-body header,
 * 0 otherwise.
 */
static int handle_inbody_header(int *seen, char *line)
{
	/* The isspace() arguments go through unsigned char because the
	 * function is undefined for negative values.
	 */
	if (!memcmp("From:", line, 5) && isspace((unsigned char)line[5])) {
		if (!(*seen & SEEN_FROM) && handle_from(line+6)) {
			*seen |= SEEN_FROM;
			return 1;
		}
	}
	if (!memcmp("Date:", line, 5) && isspace((unsigned char)line[5])) {
		if (!(*seen & SEEN_DATE)) {
			handle_date(line+6);
			*seen |= SEEN_DATE;
			return 1;
		}
	}
	if (!memcmp("Subject:", line, 8) && isspace((unsigned char)line[8])) {
		if (!(*seen & SEEN_SUBJECT)) {
			handle_subject(line+9);
			*seen |= SEEN_SUBJECT;
			return 1;
		}
	}
	if (!memcmp("[PATCH]", line, 7) && isspace((unsigned char)line[7])) {
		if (!(*seen & SEEN_SUBJECT)) {
			handle_subject(line);
			*seen |= SEEN_SUBJECT;
			return 1;
		}
	}
	return 0;
}
239
240 static char *cleanup_subject(char *subject)
241 {
242         if (keep_subject)
243                 return subject;
244         for (;;) {
245                 char *p;
246                 int len, remove;
247                 switch (*subject) {
248                 case 'r': case 'R':
249                         if (!memcmp("e:", subject+1, 2)) {
250                                 subject +=3;
251                                 continue;
252                         }
253                         break;
254                 case ' ': case '\t': case ':':
255                         subject++;
256                         continue;
257
258                 case '[':
259                         p = strchr(subject, ']');
260                         if (!p) {
261                                 subject++;
262                                 continue;
263                         }
264                         len = strlen(p);
265                         remove = p - subject;
266                         if (remove <= len *2) {
267                                 subject = p+1;
268                                 continue;
269                         }       
270                         break;
271                 }
272                 return subject;
273         }
274 }                       
275
/*
 * Collapse every run of whitespace in `buf` into a single ' ', in
 * place.  Leading and trailing runs are collapsed too, not removed.
 *
 * Done in one pass with a read and a write cursor, so the cost is
 * linear in the length of the string.
 */
static void cleanup_space(char *buf)
{
	const char *src = buf;
	char *dst = buf;

	while (*src) {
		unsigned char c = *src++;

		if (!isspace(c)) {
			*dst++ = c;
			continue;
		}
		*dst++ = ' ';
		/* swallow the rest of this run */
		while (isspace((unsigned char)*src))
			src++;
	}
	*dst = 0;
}
292
/* Handler called with the value part of a matching header line. */
typedef int (*header_fn_t)(char *);

/*
 * One entry of a header dispatch table.  Tables end with an entry
 * whose name is NULL.  namelen caches strlen(name); check_header()
 * fills it in on first use.
 */
struct header_def {
	const char *name;
	header_fn_t func;
	int namelen;
};

/*
 * Dispatch one header line: find the first table entry whose name
 * matches the start of `line` (ignoring case) and is followed by ':'
 * and a whitespace character, and call its handler with the text
 * after that.  Lines that match no entry are ignored.
 *
 * `len` is accepted for the callers' benefit but is not used.
 */
static void check_header(char *line, int len, struct header_def *header)
{
	int i;

	(void)len;

	/* Name lengths are computed once per table, on its first use. */
	if (header[0].namelen <= 0) {
		for (i = 0; header[i].name; i++)
			header[i].namelen = strlen(header[i].name);
	}
	for (i = 0; header[i].name; i++) {
		int namelen = header[i].namelen;

		if (strncasecmp(line, header[i].name, namelen) != 0)
			continue;
		/* isspace() needs an unsigned char value */
		if (line[namelen] == ':' &&
		    isspace((unsigned char)line[namelen + 1])) {
			header[i].func(line + namelen + 2);
			break;
		}
	}
}
317
318 static void check_subheader_line(char *line, int len)
319 {
320         static struct header_def header[] = {
321                 { "Content-Type", handle_subcontent_type },
322                 { "Content-Transfer-Encoding",
323                   handle_content_transfer_encoding },
324                 { NULL },
325         };
326         check_header(line, len, header);
327 }
328 static void check_header_line(char *line, int len)
329 {
330         static struct header_def header[] = {
331                 { "From", handle_from },
332                 { "Date", handle_date },
333                 { "Subject", handle_subject },
334                 { "Content-Type", handle_content_type },
335                 { "Content-Transfer-Encoding",
336                   handle_content_transfer_encoding },
337                 { NULL },
338         };
339         check_header(line, len, header);
340 }
341
/*
 * Read one logical header line from `in` into `line` (capacity `sz`),
 * joining folded continuation lines (physical lines that start with a
 * space or a tab).  Trailing whitespace of every physical line is
 * stripped.
 *
 * Returns the number of bytes stored; 0 means an empty line or end of
 * input was met before anything was stored.
 */
static int read_one_header_line(char *line, int sz, FILE *in)
{
	int used = 0;

	while (used < sz) {
		char *chunk = line + used;
		int chunk_len, next;

		if (fgets(chunk, sz - used, in) == NULL)
			break;
		chunk_len = eatspace(chunk);
		if (chunk_len == 0)
			break;
		used += chunk_len;

		/* Yuck, 2822 header "folding": peek at the next byte */
		next = fgetc(in);
		ungetc(next, in);
		if (next != ' ' && next != '\t')
			break;
	}
	return used;
}
362
/*
 * Value of the hexadecimal digit `c` (0-9, a-f, A-F), or ~0 (all bits
 * set) when `c` is not a hexadecimal digit.
 */
static unsigned hexval(int c)
{
	/* Each difference wraps to a huge value when c is below the
	 * range start, so one unsigned comparison checks both ends.
	 */
	unsigned as_digit = (unsigned)c - '0';
	unsigned as_lower = (unsigned)c - 'a';
	unsigned as_upper = (unsigned)c - 'A';

	if (as_digit < 10)
		return as_digit;
	if (as_lower < 6)
		return as_lower + 10;
	if (as_upper < 6)
		return as_upper + 10;
	return ~0u;
}
373
/*
 * Decode quoted-printable text.  Input is read from `in` up to (not
 * including) `ep`, or up to the first NUL if that comes earlier; the
 * result is written to `ot` and NUL-terminated.  `ot` may be the same
 * buffer as `in`, since the output never gets ahead of the input.
 *
 * "=XY" becomes the byte with hexadecimal value XY.  Decoding stops at
 * a soft line break ('=' followed by a newline or the end of the
 * string) and at an escape that is cut short by `ep` or by the end of
 * the string, so nothing beyond either limit is read.
 *
 * Always returns 0.
 */
static int decode_q_segment(char *in, char *ot, char *ep)
{
	int c;

	while ((c = *in++) != 0 && (in <= ep)) {
		if (c == '=') {
			int d;

			if (in >= ep)
				break; /* '=' was the last byte in range */
			d = *in++;
			if (d == '\n' || !d)
				break; /* drop trailing newline */
			if (in >= ep || !*in)
				break; /* second hex digit is missing */
			*ot++ = ((hexval(d) << 4) | hexval(*in++));
		}
		else
			*ot++ = c;
	}
	*ot = 0;
	return 0;
}
390
391 static int decode_b_segment(char *in, char *ot, char *ep)
392 {
393         /* Decode in..ep, possibly in-place to ot */
394         int c, pos = 0, acc = 0;
395
396         while ((c = *in++) != 0 && (in <= ep)) {
397                 if (c == '+')
398                         c = 62;
399                 else if (c == '/')
400                         c = 63;
401                 else if ('A' <= c && c <= 'Z')
402                         c -= 'A';
403                 else if ('a' <= c && c <= 'z')
404                         c -= 'a' - 26;
405                 else if ('0' <= c && c <= '9')
406                         c -= '0' - 52;
407                 else if (c == '=') {
408                         /* padding is almost like (c == 0), except we do
409                          * not output NUL resulting only from it;
410                          * for now we just trust the data.
411                          */
412                         c = 0;
413                 }
414                 else
415                         continue; /* garbage */
416                 switch (pos++) {
417                 case 0:
418                         acc = (c << 2);
419                         break;
420                 case 1:
421                         *ot++ = (acc | (c >> 4));
422                         acc = (c & 15) << 4;
423                         break;
424                 case 2:
425                         *ot++ = (acc | (c >> 2));
426                         acc = (c & 3) << 6;
427                         break;
428                 case 3:
429                         *ot++ = (acc | c);
430                         acc = pos = 0;
431                         break;
432                 }
433         }
434         *ot = 0;
435         return 0;
436 }
437
/*
 * Convert `line` from `charset` to UTF-8 in place.  Nothing is done
 * when `charset` is empty.
 *
 * When iconv does not know the charset, a message is printed and
 * `charset` is set to the empty string.  When the conversion itself
 * fails (or does not fit the local buffer), `line` is left as it was.
 *
 * NOTE(review): the converted text can be longer than the original
 * and is copied back without a bound; `line` is assumed to have room
 * for it -- confirm at the call sites.
 */
static void convert_to_utf8(char *line, char *charset)
{
	char *in, *out;
	size_t insize, outsize, nrc;
	char outbuf[4096]; /* cheat */
	iconv_t conv;

	if (!*charset)
		return;

	conv = iconv_open("utf-8", charset);
	if (conv == (iconv_t) -1) {
		fprintf(stderr, "cannot convert from %s to utf-8\n",
			charset);
		*charset = 0;
		return;
	}
	in = line;
	insize = strlen(in);
	out = outbuf;
	/* Leave one byte for the terminator written below. */
	outsize = sizeof(outbuf) - 1;
	nrc = iconv(conv, &in, &insize, &out, &outsize);
	iconv_close(conv);
	if (nrc == (size_t) -1)
		return;
	*out = 0;
	strcpy(line, outbuf);
}
464
465 static void decode_header_bq(char *it)
466 {
467         char *in, *out, *ep, *cp, *sp;
468         char outbuf[1000];
469
470         in = it;
471         out = outbuf;
472         while ((ep = strstr(in, "=?")) != NULL) {
473                 int sz, encoding;
474                 char charset_q[256], piecebuf[256];
475                 if (in != ep) {
476                         sz = ep - in;
477                         memcpy(out, in, sz);
478                         out += sz;
479                         in += sz;
480                 }
481                 /* E.g.
482                  * ep : "=?iso-2022-jp?B?GyR...?= foo"
483                  * ep : "=?ISO-8859-1?Q?Foo=FCbar?= baz"
484                  */
485                 ep += 2;
486                 cp = strchr(ep, '?');
487                 if (!cp)
488                         return; /* no munging */
489                 for (sp = ep; sp < cp; sp++)
490                         charset_q[sp - ep] = tolower(*sp);
491                 charset_q[cp - ep] = 0;
492                 encoding = cp[1];
493                 if (!encoding || cp[2] != '?')
494                         return; /* no munging */
495                 ep = strstr(cp + 3, "?=");
496                 if (!ep)
497                         return; /* no munging */
498                 switch (tolower(encoding)) {
499                 default:
500                         return; /* no munging */
501                 case 'b':
502                         sz = decode_b_segment(cp + 3, piecebuf, ep);
503                         break;
504                 case 'q':
505                         sz = decode_q_segment(cp + 3, piecebuf, ep);
506                         break;
507                 }
508                 if (sz < 0)
509                         return;
510                 if (metainfo_utf8)
511                         convert_to_utf8(piecebuf, charset_q);
512                 strcpy(out, piecebuf);
513                 out += strlen(out);
514                 in = ep + 2;
515         }
516         strcpy(out, in);
517         strcpy(it, outbuf);
518 }
519
520 static void decode_transfer_encoding(char *line)
521 {
522         char *ep;
523
524         switch (transfer_encoding) {
525         case TE_QP:
526                 ep = line + strlen(line);
527                 decode_q_segment(line, line, ep);
528                 break;
529         case TE_BASE64:
530                 ep = line + strlen(line);
531                 decode_b_segment(line, line, ep);
532                 break;
533         case TE_DONTCARE:
534                 break;
535         }
536 }
537
538 static void handle_info(void)
539 {
540         char *sub;
541         static int done_info = 0;
542
543         if (done_info)
544                 return;
545
546         done_info = 1;
547         sub = cleanup_subject(subject);
548         cleanup_space(name);
549         cleanup_space(date);
550         cleanup_space(email);
551         cleanup_space(sub);
552
553         /* Unwrap inline B and Q encoding, and optionally
554          * normalize the meta information to utf8.
555          */
556         decode_header_bq(name);
557         decode_header_bq(date);
558         decode_header_bq(email);
559         decode_header_bq(sub);
560         printf("Author: %s\nEmail: %s\nSubject: %s\nDate: %s\n\n",
561                name, email, sub, date);
562 }
563
564 /* We are inside message body and have read line[] already.
565  * Spit out the commit log.
566  */
567 static int handle_commit_msg(void)
568 {
569         if (!cmitmsg)
570                 return 0;
571         do {
572                 if (!memcmp("diff -", line, 6) ||
573                     !memcmp("---", line, 3) ||
574                     !memcmp("Index: ", line, 7))
575                         break;
576                 if ((multipart_boundary[0] && is_multipart_boundary(line))) {
577                         /* We come here when the first part had only
578                          * the commit message without any patch.  We
579                          * pretend we have not seen this line yet, and
580                          * go back to the loop.
581                          */
582                         return 1;
583                 }
584
585                 /* Unwrap transfer encoding and optionally
586                  * normalize the log message to UTF-8.
587                  */
588                 decode_transfer_encoding(line);
589                 if (metainfo_utf8)
590                         convert_to_utf8(line, charset);
591                 fputs(line, cmitmsg);
592         } while (fgets(line, sizeof(line), stdin) != NULL);
593         fclose(cmitmsg);
594         cmitmsg = NULL;
595         return 0;
596 }
597
598 /* We have done the commit message and have the first
599  * line of the patch in line[].
600  */
601 static void handle_patch(void)
602 {
603         do {
604                 if (multipart_boundary[0] && is_multipart_boundary(line))
605                         break;
606                 /* Only unwrap transfer encoding but otherwise do not
607                  * do anything.  We do *NOT* want UTF-8 conversion
608                  * here; we are dealing with the user payload.
609                  */
610                 decode_transfer_encoding(line);
611                 fputs(line, patchfile);
612                 patch_lines++;
613         } while (fgets(line, sizeof(line), stdin) != NULL);
614 }
615
616 /* multipart boundary and transfer encoding are set up for us, and we
617  * are at the end of the sub header.  do equivalent of handle_body up
618  * to the next boundary without closing patchfile --- we will expect
619  * that the first part to contain commit message and a patch, and
620  * handle other parts as pure patches.
621  */
622 static int handle_multipart_one_part(void)
623 {
624         int seen = 0;
625         int n = 0;
626         int len;
627
628         while (fgets(line, sizeof(line), stdin) != NULL) {
629         again:
630                 len = eatspace(line);
631                 n++;
632                 if (!len)
633                         continue;
634                 if (is_multipart_boundary(line))
635                         break;
636                 if (0 <= seen && handle_inbody_header(&seen, line))
637                         continue;
638                 seen = -1; /* no more inbody headers */
639                 line[len] = '\n';
640                 handle_info();
641                 if (handle_commit_msg())
642                         goto again;
643                 handle_patch();
644                 break;
645         }
646         if (n == 0)
647                 return -1;
648         return 0;
649 }
650
651 static void handle_multipart_body(void)
652 {
653         int part_num = 0;
654
655         /* Skip up to the first boundary */
656         while (fgets(line, sizeof(line), stdin) != NULL)
657                 if (is_multipart_boundary(line)) {
658                         part_num = 1;
659                         break;
660                 }
661         if (!part_num)
662                 return;
663         /* We are on boundary line.  Start slurping the subhead. */
664         while (1) {
665                 int len = read_one_header_line(line, sizeof(line), stdin);
666                 if (!len) {
667                         if (handle_multipart_one_part() < 0)
668                                 return;
669                 }
670                 else
671                         check_subheader_line(line, len);
672         }
673         fclose(patchfile);
674         if (!patch_lines) {
675                 fprintf(stderr, "No patch found\n");
676                 exit(1);
677         }
678 }
679
680 /* Non multipart message */
681 static void handle_body(void)
682 {
683         int seen = 0;
684
685         while (fgets(line, sizeof(line), stdin) != NULL) {
686                 int len = eatspace(line);
687                 if (!len)
688                         continue;
689                 if (0 <= seen && handle_inbody_header(&seen, line))
690                         continue;
691                 seen = -1; /* no more inbody headers */
692                 line[len] = '\n';
693                 handle_info();
694                 handle_commit_msg();
695                 handle_patch();
696                 break;
697         }
698         fclose(patchfile);
699         if (!patch_lines) {
700                 fprintf(stderr, "No patch found\n");
701                 exit(1);
702         }
703 }
704
/* Synopsis printed by usage(). */
static const char mailinfo_usage[] =
	"git-mailinfo [-k] [-u] msg patch <mail >info";

/* Print the synopsis to standard error and exit with status 1. */
static void usage(void)
{
	fprintf(stderr, "%s\n", mailinfo_usage);
	exit(1);
}
712
713 int main(int argc, char **argv)
714 {
715         while (1 < argc && argv[1][0] == '-') {
716                 if (!strcmp(argv[1], "-k"))
717                         keep_subject = 1;
718                 else if (!strcmp(argv[1], "-u"))
719                         metainfo_utf8 = 1;
720                 else
721                         usage();
722                 argc--; argv++;
723         }
724
725         if (argc != 3)
726                 usage();
727         cmitmsg = fopen(argv[1], "w");
728         if (!cmitmsg) {
729                 perror(argv[1]);
730                 exit(1);
731         }
732         patchfile = fopen(argv[2], "w");
733         if (!patchfile) {
734                 perror(argv[2]);
735                 exit(1);
736         }
737         while (1) {
738                 int len = read_one_header_line(line, sizeof(line), stdin);
739                 if (!len) {
740                         if (multipart_boundary[0])
741                                 handle_multipart_body();
742                         else
743                                 handle_body();
744                         break;
745                 }
746                 check_header_line(line, len);
747         }
748         return 0;
749 }