diff options
Diffstat (limited to 'extract/src/document.h')
-rw-r--r-- | extract/src/document.h | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/extract/src/document.h b/extract/src/document.h
new file mode 100644
index 00000000..7a1470e4
--- /dev/null
+++ b/extract/src/document.h
@@ -0,0 +1,150 @@
+#ifndef ARTIFEX_EXTRACT_DOCUMENT_H
+#define ARTIFEX_EXTRACT_DOCUMENT_H
+/* Declares the types used to represent a document: points, matrices, chars, spans, lines, paragraphs, images, pages and documents. */
+static const double pi = 3.141592653589793; /* static, so each file that includes this header gets its own copy. */
+
+typedef struct
+{
+    double x;
+    double y;
+} point_t; /* A 2D point. */
+
+typedef struct
+{
+    double a;
+    double b;
+    double c;
+    double d;
+    double e;
+    double f;
+} matrix_t; /* Six doubles a..f; used for the ctm and trm transformations stored in span_t. */
+
+double matrix_expansion(matrix_t m); /* span_t's font size is defined as matrix_expansion(trm). */
+
+int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
+;
+/* Returns zero if the first four members (a, b, c, d) of *lhs and *rhs are
+equal, otherwise +/-1. */
+
+typedef struct
+{
+    /* (x,y) before transformation by ctm and trm. */
+    double pre_x;
+    double pre_y;
+
+    /* (x,y) after transformation by ctm and trm. */
+    double x;
+    double y;
+
+    unsigned ucs; /* span_append_c() stores its c argument here. */
+    double adv; /* presumably the glyph advance -- TODO confirm. */
+} char_t;
+/* A single char in a span.
+*/
+
+typedef struct
+{
+    matrix_t ctm;
+    matrix_t trm;
+    char* font_name;
+
+    /* Font size is matrix_expansion(trm). */
+
+    struct {
+        unsigned font_bold : 1;
+        unsigned font_italic : 1;
+        unsigned wmode : 1; /* presumably the writing mode -- TODO confirm. */
+    }; /* Anonymous struct of one-bit flags. */
+
+    char_t* chars;
+    int chars_num; /* presumably the number of entries in chars -- confirm. */
+} span_t;
+/* List of chars that have the same font and are usually adjacent. */
+
+char_t* span_char_last(span_t* span);
+/* Returns the last character in span. */
+
+int span_append_c(extract_alloc_t* alloc, span_t* span, int c);
+/* Appends a new char_t to a span_t with .ucs=c and all other
+fields zeroed. NOTE(review): the meaning of the int result is not stated here. */
+
+const char* span_string(extract_alloc_t* alloc, span_t* span);
+/* Returns a static string containing info about span_t. */
+
+typedef struct
+{
+    span_t** spans;
+    int spans_num;
+} line_t;
+/* List of spans that are aligned on the same line. */
+
+span_t* line_span_first(line_t* line);
+/* Returns the first span in a line. */
+
+span_t* line_span_last(line_t* line);
+/* Returns the last span in a line. */
+
+typedef struct
+{
+    line_t** lines;
+    int lines_num;
+} paragraph_t;
+/* List of lines that are aligned and adjacent to each other so as to form a
+paragraph. */
+
+typedef struct
+{
+    char* type; /* jpg, png etc. */
+    char* name; /* Name of image file within docx. */
+    char* id; /* ID of image within docx. */
+    char* data;
+    size_t data_size;
+
+    extract_image_data_free data_free;
+    void* data_free_handle; /* presumably passed to data_free when releasing data -- TODO confirm. */
+
+} image_t;
+/* Information about an image. <type> is as passed to extract_add_image();
+<name> and <id> are created to be unique identifiers for use in the generated docx
+file. */
+
+typedef struct
+{
+    span_t** spans;
+    int spans_num;
+
+    image_t* images;
+    int images_num;
+
+    line_t** lines;
+    int lines_num;
+    /* These refer to items in .spans. Initially empty, then set by
+    extract_join(). */
+
+    paragraph_t** paragraphs;
+    int paragraphs_num;
+    /* These refer to items in .lines. Initially empty, then set
+    by extract_join(). */
+
+} page_t;
+/* A page. Contains different representations of the list of spans. */
+
+typedef struct
+{
+    page_t** pages;
+    int pages_num;
+} document_t;
+/* A list of pages. */
+
+
+typedef struct
+{
+    image_t* images;
+    int images_num;
+    char** imagetypes;
+    int imagetypes_num;
+} images_t; /* A list of images together with a list of imagetypes strings. */
+
+int extract_document_join(extract_alloc_t* alloc, document_t* document); /* NOTE(review): possibly the extract_join() that page_t's comments refer to -- confirm. */
+
+#endif /* ARTIFEX_EXTRACT_DOCUMENT_H */
|