summaryrefslogtreecommitdiff
blob: f12e27515eb38501a64901abb58a3c4d98f3cfd8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
#include "../include/extract.h"
#include "../include/extract_alloc.h"

#include "astring.h"
#include "document.h"
#include "mem.h"
#include "outf.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>


static char_t* span_char_first(span_t* span)
{
    assert(span->chars_num > 0);
    return &span->chars[0];
}

/* Returns first char_t in a line. */
static char_t* line_item_first(line_t* line)
{
    span_t* span = line_span_first(line);
    return span_char_first(span);
}

/* Returns last char_t in a line. */
static char_t* line_item_last(line_t* line)
{
    span_t* span = line_span_last(line);
    return span_char_last(span);
}

static const char* matrix_string(const matrix_t* matrix)
{
    static char ret[64];
    snprintf(ret, sizeof(ret), "{%f %f %f %f %f %f}",
            matrix->a,
            matrix->b,
            matrix->c,
            matrix->d,
            matrix->e,
            matrix->f
            );
    return ret;
}

/* Returns total width of span. */
static double span_adv_total(span_t* span)
{
    double dx = span_char_last(span)->x - span_char_first(span)->x;
    double dy = span_char_last(span)->y - span_char_first(span)->y;
    /* We add on the advance of the last item; this avoids us returning zero if
    there's only one item. */
    double adv = span_char_last(span)->adv * matrix_expansion(span->trm);
    return sqrt(dx*dx + dy*dy) + adv;
}

/* Returns distance between end of <a> and beginning of <b>. */
static double spans_adv(
        span_t* a_span,
        char_t* a,
        char_t* b
        )
{
    double delta_x = b->x - a->x;
    double delta_y = b->y - a->y;
    double s = sqrt( delta_x*delta_x + delta_y*delta_y);
    double a_size = a->adv * matrix_expansion(a_span->trm);
    s -= a_size;
    return s;
}

static double span_angle(span_t* span)
{
    /* Assume ctm is a rotation matix. */
    double ret = atan2(-span->ctm.c, span->ctm.a);
    outfx("ctm.a=%f ctm.b=%f ret=%f", span->ctm.a, span->ctm.b, ret);
    return ret;
    /* Not sure whether this is right. Inclined text seems to be done by
    setting the ctm matrix, so not really sure what trm matrix does. This code
    assumes that it also inclines text, but maybe it only rotates individual
    glyphs? */
    /*if (span->wmode == 0) {
        return atan2(span->trm.b, span->trm.a);
    }
    else {
        return atan2(span->trm.d, span->trm.c);
    }*/
}

/* Returns static string containing brief info about span_t. */
static const char* span_string2(extract_alloc_t* alloc, span_t* span)
{
    static extract_astring_t ret = {0};
    int i;
    extract_astring_free(alloc, &ret);
    extract_astring_catc(alloc, &ret, '"');
    for (i=0; i<span->chars_num; ++i) {
        extract_astring_catc(alloc, &ret, (char) span->chars[i].ucs);
    }
    extract_astring_catc(alloc, &ret, '"');
    return ret.chars;
}

/* Returns angle of <line>. */
static double line_angle(line_t* line)
{
    /* All spans in a line must have same angle, so just use the first span. */
    assert(line->spans_num > 0);
    return span_angle(line->spans[0]);
}

/* Returns static string containing brief info about line_t. */
static const char* line_string2(extract_alloc_t* alloc, line_t* line)
{
    static extract_astring_t ret = {0};
    char    buffer[256];
    int i;
    extract_astring_free(alloc, &ret);
    snprintf(buffer, sizeof(buffer), "line x=%f y=%f spans_num=%i:",
            line->spans[0]->chars[0].x,
            line->spans[0]->chars[0].y,
            line->spans_num
            );
    extract_astring_cat(alloc, &ret, buffer);
    for (i=0; i<line->spans_num; ++i) {
        extract_astring_cat(alloc, &ret, " ");
        extract_astring_cat(alloc, &ret, span_string2(alloc, line->spans[i]));
    }
    return ret.chars;
}

/* Array of pointers to lines that are aligned and adjacent to each other so as
to form a paragraph. */
static const char* paragraph_string(extract_alloc_t* alloc, paragraph_t* paragraph)
{
    static extract_astring_t ret = {0};
    extract_astring_free(alloc, &ret);
    extract_astring_cat(alloc, &ret, "paragraph: ");
    if (paragraph->lines_num) {
        extract_astring_cat(alloc, &ret, line_string2(alloc, paragraph->lines[0]));
        if (paragraph->lines_num > 1) {
            extract_astring_cat(alloc, &ret, "..");
            extract_astring_cat(
                    alloc,
                    &ret,
                    line_string2(alloc, paragraph->lines[paragraph->lines_num-1])
                    );
        }
    }
    return ret.chars;
}

/* Returns first line in paragraph. */
static line_t* paragraph_line_first(const paragraph_t* paragraph)
{
    assert(paragraph->lines_num);
    return paragraph->lines[0];
}

/* Returns last line in paragraph. */
static line_t* paragraph_line_last(const paragraph_t* paragraph)
{
    assert(paragraph->lines_num);
    return paragraph->lines[ paragraph->lines_num-1];
}



/* Things for direct conversion of text spans into lines and paragraphs. */

/* Returns 1 if lines have same wmode and are at the same angle, else 0.

todo: allow small epsilon? */
static int lines_are_compatible(
        line_t* a,
        line_t* b,
        double  angle_a,
        int     verbose
        )
{
    if (a == b) return 0;
    if (!a->spans || !b->spans) return 0;
    if (line_span_first(a)->flags.wmode != line_span_first(b)->flags.wmode) {
        return 0;
    }
    if (matrix_cmp4(
            &line_span_first(a)->ctm,
            &line_span_first(b)->ctm
            )) {
        if (verbose) {
            outf("ctm's differ:");
            outf("    %f %f %f %f %f %f",
                    line_span_first(a)->ctm.a,
                    line_span_first(a)->ctm.b,
                    line_span_first(a)->ctm.c,
                    line_span_first(a)->ctm.d,
                    line_span_first(a)->ctm.e,
                    line_span_first(a)->ctm.f
                    );
            outf("    %f %f %f %f %f %f",
                    line_span_first(b)->ctm.a,
                    line_span_first(b)->ctm.b,
                    line_span_first(b)->ctm.c,
                    line_span_first(b)->ctm.d,
                    line_span_first(b)->ctm.e,
                    line_span_first(b)->ctm.f
                    );
        }
        return 0;
    }
    {
        double angle_b = span_angle(line_span_first(b));
        if (angle_b != angle_a) {
            outfx("%s:%i: angles differ");
            return 0;
        }
    }
    return 1;
}


/* Creates representation of span_t's that consists of a list of line_t's, with
each line_t contains pointers to a list of span_t's.

We only join spans that are at the same angle and are aligned.

On entry:
    Original value of *o_lines and *o_lines_num are ignored.

    <spans> points to array of <spans_num> span_t*'s, each pointing to
    an span_t.

On exit:
    If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num
    line_t*'s, each pointing to an line_t.

    Otherwise we return -1 with errno set. *o_lines and *o_lines_num are
    undefined.
*/
static int make_lines(
        extract_alloc_t*    alloc,
        span_t**            spans,
        int                 spans_num,
        line_t***           o_lines,
        int*                o_lines_num
        )
{
    int ret = -1;

    /* Make an line_t for each span. Then we will join some of these
    line_t's together before returning. */
    int         lines_num = spans_num;
    line_t**    lines = NULL;
    int         a;
    int         num_compatible;
    int         num_joins;
    if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end;

    /* Ensure we can clean up after error. */
    for (a=0; a<lines_num; ++a) {
        lines[a] = NULL;
    }
    for (a=0; a<lines_num; ++a) {
        if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end;
        lines[a]->spans_num = 0;
        if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end;
        lines[a]->spans_num = 1;
        lines[a]->spans[0] = spans[a];
        outfx("initial line a=%i: %s", a, line_string(lines[a]));
    }

    num_compatible = 0;

    /* For each line, look for nearest aligned line, and append if found. */
    num_joins = 0;
    for (a=0; a<lines_num; ++a) {
        int b;
        int verbose = 0;
        int nearest_line_b = -1;
        double nearest_adv = 0;
        line_t* nearest_line = NULL;
        span_t* span_a;
        double angle_a;
        
        line_t* line_a = lines[a];
        if (!line_a) {
            continue;
        }

        if (0 && a < 1) verbose = 1;
        outfx("looking at line_a=%s", line_string2(line_a));

        span_a = line_span_last(line_a);
        angle_a = span_angle(span_a);
        if (verbose) outf("a=%i angle_a=%f ctm=%s: %s",
                a,
                angle_a * 180/pi,
                matrix_string(&span_a->ctm),
                line_string2(alloc, line_a)
                );

        for (b=0; b<lines_num; ++b) {
            line_t* line_b = lines[b];
            if (!line_b) {
                continue;
            }
            if (b == a) {
                continue;
            }
            if (verbose) {
                outf("");
                outf("a=%i b=%i: nearest_line_b=%i nearest_adv=%f",
                        a,
                        b,
                        nearest_line_b,
                        nearest_adv
                        );
                outf("    line_a=%s", line_string2(alloc, line_a));
                outf("    line_b=%s", line_string2(alloc, line_b));
            }
            if (!lines_are_compatible(line_a, line_b, angle_a, 0*verbose)) {
                if (verbose) outf("not compatible");
                continue;
            }

            num_compatible += 1;
            {
                /* Find angle between last glyph of span_a and first glyph of
                span_b. This detects whether the lines are lined up with each other
                (as opposed to being at the same angle but in different lines). */
                span_t* span_b = line_span_first(line_b);
                double dx = span_char_first(span_b)->x - span_char_last(span_a)->x;
                double dy = span_char_first(span_b)->y - span_char_last(span_a)->y;
                double angle_a_b = atan2(-dy, dx);
                const double angle_tolerance_deg = 1;
                if (verbose) {
                    outf("delta=(%f %f) alast=(%f %f) bfirst=(%f %f): angle_a=%f angle_a_b=%f",
                            dx,
                            dy,
                            span_char_last(span_a)->x,
                            span_char_last(span_a)->y,
                            span_char_first(span_b)->x,
                            span_char_first(span_b)->y,
                            angle_a * 180 / pi,
                            angle_a_b * 180 / pi
                            );
                }
                /* Might want to relax this when we test on non-horizontal lines.
                */
                if (fabs(angle_a_b - angle_a) * 180 / pi <= angle_tolerance_deg) {
                    /* Find distance between end of line_a and beginning of line_b. */
                    double adv = spans_adv(
                            span_a,
                            span_char_last(span_a),
                            span_char_first(span_b)
                            );
                    if (verbose) outf("nearest_adv=%f. angle_a_b=%f adv=%f",
                            nearest_adv,
                            angle_a_b,
                            adv
                            );
                    if (!nearest_line || adv < nearest_adv) {
                        nearest_line = line_b;
                        nearest_adv = adv;
                        nearest_line_b = b;
                    }
                }
                else {
                    if (verbose) outf(
                            "angle beyond tolerance: span_a last=(%f,%f) span_b first=(%f,%f) angle_a_b=%g angle_a=%g span_a.trm{a=%f b=%f}",
                            span_char_last(span_a)->x,
                            span_char_last(span_a)->y,
                            span_char_first(span_b)->x,
                            span_char_first(span_b)->y,
                            angle_a_b * 180 / pi,
                            angle_a * 180 / pi,
                            span_a->trm.a,
                            span_a->trm.b
                            );
                }
            }
        }

        if (nearest_line) {
            /* line_a and nearest_line are aligned so we can move line_b's
            spans on to the end of line_a. */
            span_t* span_b = line_span_first(nearest_line);
            b = nearest_line_b;
            if (verbose) outf("found nearest line. a=%i b=%i", a, b);

            if (1
                    && span_char_last(span_a)->ucs != ' '
                    && span_char_first(span_b)->ucs != ' '
                    ) {
                /* Find average advance of the two adjacent spans in the two
                lines we are considering joining, so that we can decide whether
                the distance between them is large enough to merit joining with
                a space character). */
                double average_adv = (
                        (span_adv_total(span_a) + span_adv_total(span_b))
                        /
                        (double) (span_a->chars_num + span_b->chars_num)
                        );

                int insert_space = (nearest_adv > 0.25 * average_adv);
                if (insert_space) {
                    /* Append space to span_a before concatenation. */
                    char_t* item;
                    if (verbose) {
                        outf("(inserted space) nearest_adv=%f average_adv=%f",
                                nearest_adv,
                                average_adv
                                );
                        outf("    a: %s", span_string(alloc, span_a));
                        outf("    b: %s", span_string(alloc, span_b));
                    }
                    if (extract_realloc2(
                            alloc,
                            &span_a->chars,
                            sizeof(char_t) * span_a->chars_num,
                            sizeof(char_t) * (span_a->chars_num + 1)
                            )) goto end;
                    item = &span_a->chars[span_a->chars_num];
                    span_a->chars_num += 1;
                    extract_bzero(item, sizeof(*item));
                    item->ucs = ' ';
                    item->adv = nearest_adv;
                }

                if (verbose) {
                    outf("Joining spans a=%i b=%i:", a, b);
                    outf("    %s", span_string2(alloc, span_a));
                    outf("    %s", span_string2(alloc, span_b));
                }
                if (0) {
                    /* Show details about what we're joining. */
                    outf(
                            "joining line insert_space=%i a=%i (y=%f) to line b=%i (y=%f). nearest_adv=%f average_adv=%f",
                            insert_space,
                            a,
                            span_char_last(span_a)->y,
                            b,
                            span_char_first(span_b)->y,
                            nearest_adv,
                            average_adv
                            );
                    outf("a: %s", span_string(alloc, span_a));
                    outf("b: %s", span_string(alloc, span_b));
                }
            }

            /* We might end up with two adjacent spaces here. But removing a
            space could result in an empty line_t, which could break various
            assumptions elsewhere. */

            if (verbose) {
                outf("Joining spans a=%i b=%i:", a, b);
                outf("    %s", span_string2(alloc, span_a));
                outf("    %s", span_string2(alloc, span_b));
            }
            if (extract_realloc2(
                    alloc,
                    &line_a->spans,
                    sizeof(span_t*) * line_a->spans_num,
                    sizeof(span_t*) * (line_a->spans_num + nearest_line->spans_num)
                    )) goto end;
            {
                int k;
                for (k=0; k<nearest_line->spans_num; ++k) {
                    line_a->spans[ line_a->spans_num + k] = nearest_line->spans[k];
                }
            }
            line_a->spans_num += nearest_line->spans_num;

            /* Ensure that we ignore nearest_line from now on. */
            extract_free(alloc, &nearest_line->spans);
            extract_free(alloc, &nearest_line);
            outfx("setting line[b=%i] to NULL", b);
            lines[b] = NULL;

            num_joins += 1;

            if (b > a) {
                /* We haven't yet tried appending any spans to nearest_line, so
                the new extended line_a needs checking again. */
                a -= 1;
            }
            outfx("new line is:\n    %s", line_string2(line_a));
        }
    }

    {
        /* Remove empty lines left behind after we appended pairs of lines. */
        int from;
        int to;
        int lines_num_old;
        for (from=0, to=0; from<lines_num; ++from) {
            if (lines[from]) {
                outfx("final line from=%i: %s",
                        from,
                        lines[from] ? line_string(lines[from]) : "NULL"
                        );
                lines[to] = lines[from];
                to += 1;
            }
        }
        lines_num_old = lines_num;
        lines_num = to;
        if (extract_realloc2(
                alloc,
                &lines,
                sizeof(line_t*) * lines_num_old,
                sizeof(line_t*) * lines_num
                )) {
            /* Should always succeed because we're not increasing allocation size. */
            goto end;
        }
    }

    *o_lines = lines;
    *o_lines_num = lines_num;
    ret = 0;

    outf("Turned %i spans into %i lines. num_compatible=%i",
            spans_num,
            lines_num,
            num_compatible
            );

    end:
    if (ret) {
        /* Free everything. */
        if (lines) {
            for (a=0; a<lines_num; ++a) {
                if (lines[a])   extract_free(alloc, &lines[a]->spans);
                extract_free(alloc, &lines[a]);
            }
        }
        extract_free(alloc, &lines);
    }
    return ret;
}


/* Returns max font size of all span_t's in an line_t. */
static double line_font_size_max(line_t* line)
{
    double  size_max = 0;
    int i;
    for (i=0; i<line->spans_num; ++i) {
        span_t* span = line->spans[i];
        /* fixme: <size> should be double, which changes some output. */
        double size = matrix_expansion(span->trm);
        if (size > size_max) {
            size_max = size;
        }
    }
    return size_max;
}



/* Find distance between parallel lines line_a and line_b, both at <angle>.

        _-R
     _-
    A------------_P
     \        _-
      \    _B
       \_-
        Q

A is (ax, ay)
B is (bx, by)
APB and PAR are both <angle>.

AR and QBP are parallel, and are the lines of text a and b
respectively.

AQB is a right angle. We need to find AQ.
*/
static double line_distance(
        double ax,
        double ay,
        double bx,
        double by,
        double angle
        )
{
    double dx = bx - ax;
    double dy = by - ay;


    return dx * sin(angle) + dy * cos(angle);
}


/* A comparison function for use with qsort(), for sorting paragraphs within a
page. */
static int paragraphs_cmp(const void* a, const void* b)
{
    const paragraph_t* const* a_paragraph = a;
    const paragraph_t* const* b_paragraph = b;
    line_t* a_line = paragraph_line_first(*a_paragraph);
    line_t* b_line = paragraph_line_first(*b_paragraph);

    span_t* a_span = line_span_first(a_line);
    span_t* b_span = line_span_first(b_line);

    /* If ctm matrices differ, always return this diff first. Note that we
    ignore .e and .f because if data is from ghostscript then .e and .f vary
    for each span, and we don't care about these differences. */
    int d = matrix_cmp4(&a_span->ctm, &b_span->ctm);
    if (d)  return d;

    {
        double a_angle = line_angle(a_line);
        double b_angle = line_angle(b_line);
        if (fabs(a_angle - b_angle) > 3.14/2) {
            /* Give up if more than 90 deg. */
            return 0;
        }
        {
            double angle = (a_angle + b_angle) / 2;
            double ax = line_item_first(a_line)->x;
            double ay = line_item_first(a_line)->y;
            double bx = line_item_first(b_line)->x;
            double by = line_item_first(b_line)->y;
            double distance = line_distance(ax, ay, bx, by, angle);
            if (distance > 0)   return -1;
            if (distance < 0)   return +1;
        }
    }
    return 0;
}


/* Creates a representation of line_t's that consists of a list of
paragraph_t's.

We only join lines that are at the same angle and are adjacent.

On entry:
    Original value of *o_paragraphs and *o_paragraphs_num are ignored.

    <lines> points to array of <lines_num> line_t*'s, each pointing to
    a line_t.

On exit:
    On sucess, returns zero, *o_paragraphs points to array of *o_paragraphs_num
    paragraph_t*'s, each pointing to an paragraph_t. In the
    array, paragraph_t's with same angle are sorted.

    On failure, returns -1 with errno set. *o_paragraphs and *o_paragraphs_num
    are undefined.
*/
static int make_paragraphs(
        extract_alloc_t*    alloc, 
        line_t**            lines,
        int                 lines_num,
        paragraph_t***      o_paragraphs,
        int*                o_paragraphs_num
        )
{
    int ret = -1;
    int a;
    int num_joins;
    paragraph_t** paragraphs = NULL;

    /* Start off with an paragraph_t for each line_t. */
    int paragraphs_num = lines_num;
    if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * paragraphs_num)) goto end;
    /* Ensure we can clean up after error when setting up. */
    for (a=0; a<paragraphs_num; ++a) {
        paragraphs[a] = NULL;
    }
    /* Set up initial paragraphs. */
    for (a=0; a<paragraphs_num; ++a) {
        if (extract_malloc(alloc, &paragraphs[a], sizeof(paragraph_t))) goto end;
        paragraphs[a]->lines_num = 0;
        if (extract_malloc(alloc, &paragraphs[a]->lines, sizeof(line_t*) * 1)) goto end;
        paragraphs[a]->lines_num = 1;
        paragraphs[a]->lines[0] = lines[a];
    }

    num_joins = 0;
    for (a=0; a<paragraphs_num; ++a) {
        paragraph_t* nearest_paragraph;
        int nearest_paragraph_b;
        double nearest_paragraph_distance;
        line_t* line_a;
        double angle_a;
        int verbose;
        int b;
        
        paragraph_t* paragraph_a = paragraphs[a];
        if (!paragraph_a) {
            /* This paragraph is empty - already been appended to a different
            paragraph. */
            continue;
        }

        nearest_paragraph = NULL;
        nearest_paragraph_b = -1;
        nearest_paragraph_distance = -1;
        assert(paragraph_a->lines_num > 0);

        line_a = paragraph_line_last(paragraph_a);
        angle_a = line_angle(line_a);

        verbose = 0;

        /* Look for nearest paragraph_t that could be appended to
        paragraph_a. */
        for (b=0; b<paragraphs_num; ++b) {
            paragraph_t* paragraph_b = paragraphs[b];
            line_t* line_b;
            if (!paragraph_b) {
                /* This paragraph is empty - already been appended to a different
                paragraph. */
                continue;
            }
            line_b = paragraph_line_first(paragraph_b);
            if (!lines_are_compatible(line_a, line_b, angle_a, 0)) {
                continue;
            }

            {
                double ax = line_item_last(line_a)->x;
                double ay = line_item_last(line_a)->y;
                double bx = line_item_first(line_b)->x;
                double by = line_item_first(line_b)->y;
                double distance = line_distance(ax, ay, bx, by, angle_a);
                if (verbose) {
                    outf(
                            "angle_a=%f a=(%f %f) b=(%f %f) delta=(%f %f) distance=%f:",
                            angle_a * 180 / pi,
                            ax, ay,
                            bx, by,
                            bx - ax,
                            by - ay,
                            distance
                            );
                    outf("    line_a=%s", line_string2(alloc, line_a));
                    outf("    line_b=%s", line_string2(alloc, line_b));
                }
                if (distance > 0) {
                    if (nearest_paragraph_distance == -1
                            || distance < nearest_paragraph_distance) {
                        if (verbose) {
                            outf("updating nearest. distance=%f:", distance);
                            outf("    line_a=%s", line_string2(alloc, line_a));
                            outf("    line_b=%s", line_string2(alloc, line_b));
                        }
                        nearest_paragraph_distance = distance;
                        nearest_paragraph_b = b;
                        nearest_paragraph = paragraph_b;
                    }
                }
            }
        }

        if (nearest_paragraph) {
            double line_b_size = line_font_size_max(
                    paragraph_line_first(nearest_paragraph)
                    );
            line_t* line_b = paragraph_line_first(nearest_paragraph);
            (void) line_b; /* Only used in outfx(). */
            if (nearest_paragraph_distance < 1.4 * line_b_size) {
                /* Paragraphs are close together vertically compared to maximum
                font size of first line in second paragraph, so we'll join them
                into a single paragraph. */
                span_t* a_span;
                int a_lines_num_new;
                if (verbose) {
                    outf(
                            "joing paragraphs. a=(%f,%f) b=(%f,%f) nearest_paragraph_distance=%f line_b_size=%f",
                            line_item_last(line_a)->x,
                            line_item_last(line_a)->y,
                            line_item_first(line_b)->x,
                            line_item_first(line_b)->y,
                            nearest_paragraph_distance,
                            line_b_size
                            );
                    outf("    %s", paragraph_string(alloc, paragraph_a));
                    outf("    %s", paragraph_string(alloc, nearest_paragraph));
                    outf("paragraph_a ctm=%s",
                            matrix_string(&paragraph_a->lines[0]->spans[0]->ctm)
                            );
                    outf("paragraph_a trm=%s",
                            matrix_string(&paragraph_a->lines[0]->spans[0]->trm)
                            );
                }
                /* Join these two paragraph_t's. */
                a_span = line_span_last(line_a);
                if (span_char_last(a_span)->ucs == '-') {
                    /* remove trailing '-' at end of prev line. char_t doesn't
                    contain any malloc-heap pointers so this doesn't leak. */
                    a_span->chars_num -= 1;
                }
                else {
                    /* Insert space before joining adjacent lines. */
                    char_t* c_prev;
                    char_t* c;
                    if (span_append_c(alloc, line_span_last(line_a), ' ')) goto end;
                    c_prev = &a_span->chars[ a_span->chars_num-2];
                    c = &a_span->chars[ a_span->chars_num-1];
                    c->x = c_prev->x + c_prev->adv * a_span->ctm.a;
                    c->y = c_prev->y + c_prev->adv * a_span->ctm.c;
                }

                a_lines_num_new = paragraph_a->lines_num + nearest_paragraph->lines_num;
                if (extract_realloc2(
                        alloc,
                        &paragraph_a->lines,
                        sizeof(line_t*) * paragraph_a->lines_num,
                        sizeof(line_t*) * a_lines_num_new
                        )) goto end;
                {
                    int i;
                    for (i=0; i<nearest_paragraph->lines_num; ++i) {
                        paragraph_a->lines[paragraph_a->lines_num + i]
                            = nearest_paragraph->lines[i];
                    }
                }
                paragraph_a->lines_num = a_lines_num_new;

                /* Ensure that we skip nearest_paragraph in future. */
                extract_free(alloc, &nearest_paragraph->lines);
                extract_free(alloc, &nearest_paragraph);
                paragraphs[nearest_paragraph_b] = NULL;

                num_joins += 1;
                outfx(
                        "have joined paragraph a=%i to snearest_paragraph_b=%i",
                        a,
                        nearest_paragraph_b
                        );

                if (nearest_paragraph_b > a) {
                    /* We haven't yet tried appending any paragraphs to
                    nearest_paragraph_b, so the new extended paragraph_a needs
                    checking again. */
                    a -= 1;
                }
            }
            else {
                outfx(
                        "Not joining paragraphs. nearest_paragraph_distance=%f line_b_size=%f",
                        nearest_paragraph_distance,
                        line_b_size
                        );
            }
        }
    }

    {
        /* Remove empty paragraphs. */
        int from;
        int to;
        int paragraphs_num_old;
        for (from=0, to=0; from<paragraphs_num; ++from) {
            if (paragraphs[from]) {
                paragraphs[to] = paragraphs[from];
                to += 1;
            }
        }
        outfx("paragraphs_num=%i => %i", paragraphs_num, to);
        paragraphs_num_old = paragraphs_num;
        paragraphs_num = to;
        if (extract_realloc2(
                alloc,
                &paragraphs,
                sizeof(paragraph_t*) * paragraphs_num_old,
                sizeof(paragraph_t*) * paragraphs_num
                )) {
            /* Should always succeed because we're not increasing allocation size, but
            can fail with memento squeeze. */
            goto end;
        }
    }

    /* Sort paragraphs so they appear in correct order, using paragraphs_cmp().
    */
    qsort(
            paragraphs,
            paragraphs_num,
            sizeof(paragraph_t*), paragraphs_cmp
            );

    *o_paragraphs = paragraphs;
    *o_paragraphs_num = paragraphs_num;
    ret = 0;
    outf("Turned %i lines into %i paragraphs",
            lines_num,
            paragraphs_num
            );


    end:

    if (ret) {
        if (paragraphs) {
            for (a=0; a<paragraphs_num; ++a) {
                if (paragraphs[a])   extract_free(alloc, &paragraphs[a]->lines);
                extract_free(alloc, &paragraphs[a]);
            }
        }
        extract_free(alloc, &paragraphs);
    }
    return ret;
}

int extract_document_join(extract_alloc_t* alloc, document_t* document)
{
    int ret = -1;

    /* For each page in <document> we join spans into lines and paragraphs. A
    line is a list of spans that are at the same angle and on the same line. A
    paragraph is a list of lines that are at the same angle and close together.
    */
    int p;
    for (p=0; p<document->pages_num; ++p) {
        extract_page_t* page = document->pages[p];
        outf("processing page %i: num_spans=%i", p, page->spans_num);

        if (make_lines(
                alloc,
                page->spans,
                page->spans_num,
                &page->lines,
                &page->lines_num
                )) goto end;

        if (make_paragraphs(
                alloc,
                page->lines,
                page->lines_num,
                &page->paragraphs,
                &page->paragraphs_num
                )) goto end;
    }

    ret = 0;

    end:

    return ret;
}