From edf7f9248efee5fdf8133ef3b1903a28b76ca730 Mon Sep 17 00:00:00 2001 From: Robin Stuart Date: Tue, 10 Dec 2019 21:15:23 +0000 Subject: [PATCH] Ultracode codeword generation Supports Reader Init, ECI, GS1 and handles switching between 8-bit, ASCII and C43 modes with C43 macros --- backend/ultra.c | 581 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 580 insertions(+), 1 deletion(-) diff --git a/backend/ultra.c b/backend/ultra.c index da5cfb38..47c2ea69 100644 --- a/backend/ultra.c +++ b/backend/ultra.c @@ -29,12 +29,591 @@ SUCH DAMAGE. */ + /* This version was developed using AIMD/TSC15032-43 v0.99c Edit 60, dated 4th Nov 2015 */ + +#ifdef _MSC_VER +#include +#endif #include #include #include "common.h" -int ultracode(struct zint_symbol *symbol, const unsigned char source[], const size_t in_length) { +#define EIGHTBIT_MODE 10 +#define ASCII_MODE 20 +#define C43_MODE 30 +#define PREDICT_WINDOW 12 + +static const char fragment[27][14] = {"http://", "https://", "http://www.", "https://www.", + "ftp://", "www.", ".com", ".edu", ".gov", ".int", ".mil", ".net", ".org", + ".mobi", ".coop", ".biz", ".info", "mailto:", "tel:", ".cgi", ".asp", + ".aspx", ".php", ".htm", ".html", ".shtml", "file:"}; + +static const char ultra_c43_set1[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,%"; +static const char ultra_c43_set2[] = "abcdefghijklmnopqrstuvwxyz:/?#[]@=_~!.,-"; +static const char ultra_c43_set3[] = "{}`()\"+'<>|$;&\\^*"; +static const char ultra_digit[] = "0123456789,/"; + +int ultra_find_fragment(unsigned char source[], int source_length, int position) { + int retval = -1; + int j, k, latch; + + for (j = 0; j < 27; j++) { + latch = 0; + if ((position + strlen(fragment[j])) <= source_length) { + latch = 1; + for (k = 0; k < strlen(fragment[j]); k++) { + if (source[position + k] != fragment[j][k]) { + latch = 0; + } + } + } + + if (latch) { + retval = j; + } + } + + return retval; +} + +/* Encode characters in 8-bit mode */ +float look_ahead_eightbit(unsigned char source[], int in_length, int in_posn, char current_mode, int end_char, int cw[], int* cw_len) +{ + int codeword_count = 0; + int i; + int letters_encoded = 0; + + if (current_mode != EIGHTBIT_MODE) { + cw[codeword_count] = 282; // Unlatch + codeword_count += 1; + } + + i = in_posn; + do { + cw[codeword_count] = source[i]; + i++; + codeword_count++; + } while ((i < in_length) && (i < end_char)); + + letters_encoded = i - in_posn; + + //printf("8BIT FRAG: "); + //for (i = 0; i < codeword_count; i++) { + // printf("%d ", cw[i]); + //} + //printf("\n"); + + *cw_len = codeword_count; + + //printf("%d letters in %d codewords\n", letters_encoded, codeword_count); + if (codeword_count == 0) { + return 0.0; + } else { + return (float)letters_encoded / (float)codeword_count; + } +} + +/* Encode character in the ASCII mode/submode (including numeric compression) */ +float look_ahead_ascii(unsigned char source[], int in_length, int in_posn, char current_mode, int symbol_mode, int end_char, int cw[], int* cw_len){ + int codeword_count = 0; + int i; + int first_digit, second_digit; + int letters_encoded = 0; + + if (current_mode == EIGHTBIT_MODE) { + cw[codeword_count] = 267; // Latch ASCII Submode + codeword_count++; + } + + if (current_mode == C43_MODE) { + cw[codeword_count] = 282; // Unlatch + codeword_count++; + if (symbol_mode == EIGHTBIT_MODE) { + cw[codeword_count] = 267; // Latch ASCII Submode + codeword_count++; + } + } + + i = in_posn; + do { + /* Check for double digits */ + if (in_posn != (in_length - 1)) { + first_digit = posn(ultra_digit, source[i]); + second_digit = posn(ultra_digit, source[i + 1]); + if ((first_digit != -1) && (second_digit != -1)) { + /* Double digit can be encoded */ + if ((first_digit >= 0) && (first_digit <= 9) && (second_digit >= 0) && (second_digit <= 9)) { + /* Double digit numerics */ + cw[codeword_count] = (10 * first_digit) + second_digit + 128; + codeword_count++; + i += 2; + } else if ((first_digit >= 0) && (first_digit <= 9) && (second_digit == 10)) { + /* Single digit followed by selected decimal point character */ + cw[codeword_count] = first_digit + 228; + codeword_count++; + i += 2; + } else if ((first_digit == 10) && (second_digit >= 0) && (second_digit <= 9)) { + /* Selected decimal point character followed by single digit */ + cw[codeword_count] = second_digit + 238; + codeword_count++; + i += 2; + } else if ((first_digit >= 0) && (first_digit <= 10) && (second_digit == 11)) { + /* Single digit or decimal point followed by field deliminator */ + cw[codeword_count] = first_digit + 248; + codeword_count++; + i += 2; + } else if ((first_digit == 11) && (second_digit >= 0) && (second_digit <= 10)) { + /* Field deliminator followed by single digit or decimal point */ + cw[codeword_count] = second_digit + 259; + codeword_count++; + i += 2; + } + } + } + + if (source[i] < 0x7F) { + cw[codeword_count] = source[i]; + codeword_count++; + i++; + } + } while ((i < in_length) && (i < end_char) && (source[i] < 0x80)); + + letters_encoded = i - in_posn; + + //printf("ASCII FRAG: "); + //for (i = 0; i < codeword_count; i++) { + // printf("%d ", cw[i]); + //} + //printf("\n"); + + *cw_len = codeword_count; + + //printf("%d letters in %d codewords\n", letters_encoded, codeword_count); + if (codeword_count == 0) { + return 0.0; + } else { + return (float)letters_encoded / (float)codeword_count; + } +} + +int get_subset(unsigned char source[], int in_length, int in_posn) { + int fragno; + int subset = 0; + + if (posn(ultra_c43_set1, source[in_posn]) != -1) { + subset = 1; + } + + if (posn(ultra_c43_set2, source[in_posn]) != -1) { + subset = 2; + } + + if (posn(ultra_c43_set3, source[in_posn]) != -1) { + subset = 3; + } + + fragno = ultra_find_fragment(source, in_length, in_posn); + if ((fragno != -1) && (fragno != 26)) { + subset = 3; + } + + return subset; +} + +/* Encode characters in the C43 compaction submode */ +float look_ahead_c43(unsigned char source[], int in_length, int in_posn, char current_mode, int end_char, int cw[], int* cw_len){ + int codeword_count = 0; + int subcodeword_count = 0; + int i; + int subset = 0; + int fragno; + int subposn = in_posn; + int new_subset; + int unshift_set; + int base43_value; + int letters_encoded = 0; + int pad; + +#ifndef _MSC_VER + int subcw[in_length]; +#else + int * subcw = (int *) _alloca(in_length * sizeof (int)); +#endif /* _MSC_VER */ + + subset = get_subset(source, in_length, subposn); + + if (subset == 0) { + return 0.0; + } + + if (current_mode == EIGHTBIT_MODE) { + /* Check for permissable URL C43 macro sequences, otherwise encode directly */ + fragno = ultra_find_fragment(source, in_length, subposn); + + if ((fragno == 2) || (fragno == 3)) { + // http://www. > http:// + // https://www. > https:// + fragno -= 2; + } + + switch(fragno) { + case 17: // mailto: + cw[codeword_count] = 276; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + case 18: // tel: + cw[codeword_count] = 277; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + case 26: // file: + cw[codeword_count] = 278; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + case 0: // http:// + cw[codeword_count] = 279; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + case 1: // https:// + cw[codeword_count] = 280; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + case 4: // ftp:// + cw[codeword_count] = 281; + subposn += strlen(fragment[fragno]); + codeword_count++; + break; + default: + if (subset == 1) { + cw[codeword_count] = 260; // C43 Compaction Submode C1 + codeword_count++; + } + + if ((subset == 2) || (subset == 3)) { + cw[codeword_count] = 266; // C43 Compaction Submode C2 + codeword_count++; + } + break; + } + } + + if (current_mode == ASCII_MODE) { + if (subset == 1) { + cw[codeword_count] = 278; // C43 Compaction Submode C1 + codeword_count++; + } + + if ((subset == 2) || (subset == 3)) { + cw[codeword_count] = 280; // C43 Compaction Submode C2 + codeword_count++; + } + } + unshift_set = subset; + + do { + if (subset == 1) { + subcw[subcodeword_count] = posn(ultra_c43_set1, source[subposn]); + subcodeword_count++; + subposn++; + } + + if (subset == 2) { + subcw[subcodeword_count] = posn(ultra_c43_set2, source[subposn]); + subcodeword_count++; + subposn++; + } + + if (subset == 3) { + subcw[subcodeword_count] = 41; // Shift to set 3 + subcodeword_count++; + + fragno = ultra_find_fragment(source, in_length, subposn); + if (fragno == 26) { + fragno = -1; + } + if ((fragno >= 0) && (fragno <= 18)) { + subcw[subcodeword_count] = fragno; + subcodeword_count++; + subposn += strlen(fragment[fragno]); + } + if ((fragno >= 18) && (fragno <= 25)) { + subcw[subcodeword_count] = fragno + 17; + subcodeword_count++; + subposn += strlen(fragment[fragno]); + } + if (fragno == -1) { + subcw[subcodeword_count] = posn(ultra_c43_set3, source[subposn]); + subcodeword_count++; + subposn++; + } + subset = unshift_set; + } + + if (subposn < in_length) { + new_subset = get_subset(source, in_length, subposn); + + if (((subset == 1) && (new_subset == 2)) && ((source[subposn] == '.') || (source[subposn] == ','))) { + new_subset = 1; + } + + if ((new_subset != subset) && ((new_subset == 1) || (new_subset == 2))) { + subcw[subcodeword_count] = 42; // Latch to other C43 set + subcodeword_count++; + unshift_set = new_subset; + } + + subset = new_subset; + } + } while ((subposn < in_length) && (subposn < end_char) && (subset != 0)); + + pad = 3 - (subcodeword_count % 3); + if (pad == 3) { + pad = 0; + } + //printf("Pad = %d\n", pad); + for (i = 0; i < pad; i++) { + subcw[subcodeword_count] = 42; // Latch to other C43 set used as pad + subcodeword_count++; + } + + letters_encoded = subposn - in_posn; + + //printf("C43 SUBFRAG: "); + //for (i = 0; i < subcodeword_count; i++) { + // printf("%d ", subcw[i]); + //} + //printf("\n"); + + for (i = 0; i < subcodeword_count; i += 3) { + base43_value = (43 * 43 * subcw[i]) + (43 * subcw[i + 1]) + subcw[i + 2]; + cw[codeword_count] = base43_value / 282; + codeword_count++; + cw[codeword_count] = base43_value % 282; + codeword_count++; + } + + //printf("C43 FRAG: "); + //for (i = 0; i < codeword_count; i++) { + // printf("%d ", cw[i]); + //} + //printf("\n"); + + *cw_len = codeword_count; + + //printf("%d letters in %d codewords\n", letters_encoded, codeword_count); + if (codeword_count == 0) { + return 0.0; + } else { + return (float)letters_encoded / (float)codeword_count; + } +} + +/* Produces a set of codewords which are "somewhat" optimised - this could be improved on */ +int ultra_generate_codewords(struct zint_symbol *symbol, const unsigned char source[], const size_t in_length, int codewords[]) { + int i; + int crop_length; + int codeword_count = 0; + int input_posn = 0; + char symbol_mode; + char current_mode; + float eightbit_score; + float ascii_score; + float c43_score; + int end_char; + int block_length; + int fragment_length; + int fragno; + +#ifndef _MSC_VER + unsigned char crop_source[in_length]; + char mode[in_length]; + int cw_fragment[in_length]; +#else + unsigned char * crop_source = (unsigned char *) _alloca(in_length * sizeof (unsigned char)); + char * mode = (char *) _alloca(in_length * sizeof (char)); + int * cw_fragment = (int *) _alloca(in_length * sizeof (int)); +#endif /* _MSC_VER */ + + /* Section 7.6.2 indicates that ECI \000003 to \811799 are supported */ + /* but this seems to contradict Table 5 which only shows up to \000898 */ + if (symbol->eci > 898) { + strcpy(symbol->errtxt, "ECI value out of range"); + return ZINT_ERROR_INVALID_OPTION; + } + + // Decide start character codeword (from Table 5) + symbol_mode = ASCII_MODE; + for (i = 0; i < in_length; i++) { + if (source[i] >= 0x80) { + symbol_mode = EIGHTBIT_MODE; + } + } + + if (symbol->output_options & READER_INIT) { + /* Reader Initialisation mode */ + if (symbol_mode == ASCII_MODE) { + codewords[0] = 272; // 7-bit ASCII mode + codewords[1] = 271; // FNC3 + } else { + codewords[0] = 257; // 8859-1 + codewords[1] = 269; // FNC3 + } + codeword_count = 2; + } else { + /* Calculate start character codeword */ + if (symbol_mode == ASCII_MODE) { + if (symbol->input_mode == GS1_MODE) { + codewords[0] = 273; + } else { + codewords[0] = 272; + } + } else { + if ((symbol->eci >= 3) && (symbol->eci <= 18) && (symbol->eci != 14)) { + // ECI indicate use of character set within ISO/IEC 8859 + codewords[0] = 257 + (symbol->eci - 3); + if (codewords[0] > 267) { + // Avoids ECI 14 for non-existant ISO/IEC 8859-12 + codewords[0]--; + } + } else if (symbol->eci > 18) { + // ECI indicates use of character set outside ISO/IEC 8859 + codewords[0] = 273 + (symbol->eci / 256); + codewords[1] = symbol->eci % 256; + codeword_count++; + } else { + codewords[0] = 257; // Default is assumed to be ISO/IEC 8859-1 (ECI 3) + } + } + + if ((codewords[0] == 257) || (codewords[0] == 272)) { + fragno = ultra_find_fragment((unsigned char *)source, in_length, 0); + + // Check for http:// at start of input + if ((fragno == 0) || (fragno == 2)) { + codewords[0] = 281; + input_posn = 7; + symbol_mode = EIGHTBIT_MODE; + } + + + // Check for https:// at start of input + if ((fragno == 1) || (fragno == 3)) { + codewords[0] = 282; + input_posn = 8; + symbol_mode = EIGHTBIT_MODE; + } + } + } + + codeword_count++; + + /* TODO: Check for 06 Macro Sequence and crop accordingly */ + + /* Make a cropped version of input data - removes http:// and https:// if needed */ + for (i = input_posn; i < in_length; i++) { + crop_source[i - input_posn] = source[i]; + } + crop_length = in_length - input_posn; + crop_source[crop_length] = '\0'; + + /* Attempt encoding in all three modes to see which offers best compaction and store results */ + current_mode = symbol_mode; + input_posn = 0; + do { + end_char = input_posn + PREDICT_WINDOW; + eightbit_score = look_ahead_eightbit(crop_source, crop_length, input_posn, current_mode, end_char, cw_fragment, &fragment_length); + ascii_score = look_ahead_ascii(crop_source, crop_length, input_posn, current_mode, symbol_mode, end_char, cw_fragment, &fragment_length); + c43_score = look_ahead_c43(crop_source, crop_length, input_posn, current_mode, end_char, cw_fragment, &fragment_length); + + mode[input_posn] = 'a'; + current_mode = ASCII_MODE; + + if ((c43_score > ascii_score) && (c43_score > eightbit_score)) { + mode[input_posn] = 'c'; + current_mode = C43_MODE; + } + + if ((eightbit_score > ascii_score) && (eightbit_score > c43_score)) { + mode[input_posn] = '8'; + current_mode = EIGHTBIT_MODE; + } + input_posn++; + } while (input_posn < crop_length); + mode[input_posn] = '\0'; + + /* Use results from test to perform actual mode switching */ + current_mode = symbol_mode; + input_posn = 0; + do { + block_length = 0; + do { + block_length++; + } while (mode[input_posn + block_length] == mode[input_posn]); + + switch(mode[input_posn]) { + case 'a': + ascii_score = look_ahead_ascii(crop_source, crop_length, input_posn, current_mode, symbol_mode, input_posn + block_length, cw_fragment, &fragment_length); + current_mode = ASCII_MODE; + break; + case 'c': + c43_score = look_ahead_c43(crop_source, crop_length, input_posn, current_mode, input_posn + block_length, cw_fragment, &fragment_length); + + /* Substitute temporary latch if possible */ + if ((current_mode == EIGHTBIT_MODE) && (cw_fragment[0] == 261) && (fragment_length >= 5) && (fragment_length <= 11)) { + /* Temporary latch to submode 1 from Table 11 */ + cw_fragment[0] = 256 + ((fragment_length - 5) / 2); + } else if ((current_mode == EIGHTBIT_MODE) && (cw_fragment[0] == 266) && (fragment_length >= 5) && (fragment_length <= 11)) { + /* Temporary latch to submode 2 from Table 11 */ + cw_fragment[0] = 262 + ((fragment_length - 5) / 2); + } else if ((current_mode == ASCII_MODE) && (cw_fragment[0] == 278) && (fragment_length >= 5) && (fragment_length <= 11)) { + /* Temporary latch to submode 1 from Table 9 */ + cw_fragment[0] = 274 + ((fragment_length - 5) / 2); + } else { + current_mode = C43_MODE; + } + break; + case '8': + eightbit_score = look_ahead_eightbit(crop_source, crop_length, input_posn, current_mode, input_posn + block_length, cw_fragment, &fragment_length); + current_mode = EIGHTBIT_MODE; + break; + } + + for (i = 0; i < fragment_length; i++) { + codewords[codeword_count + i] = cw_fragment[i]; + } + codeword_count += fragment_length; + + input_posn += block_length; + } while (input_posn < crop_length); + + //printf("RED: %s\n", crop_source); + //printf("MOD: %s\n", mode); + + return codeword_count; +} + +int ultracode(struct zint_symbol *symbol, const unsigned char source[], const size_t in_length) { + int data_cw_count = 0; + +#ifndef _MSC_VER + int data_codewords[in_length * 2]; +#else + int* data_codewords = (int *) _alloca(in_length * 2 * sizeof (int)); +#endif /* _MSC_VER */ + + data_cw_count = ultra_generate_codewords(symbol, source, in_length, data_codewords); + + printf("Codewords returned = %d\n", data_cw_count); + + for (int i = 0; i < data_cw_count; i++) { + printf("%d ", data_codewords[i]); + } + printf("\n"); strcpy(symbol->errtxt, "1000: Ultracode has not been implemented - yet!"); return ZINT_ERROR_INVALID_OPTION;