From af2e8e52b82ed1b928635d9c31914c9de9bf5c44 Mon Sep 17 00:00:00 2001 From: Leonid Pliushch Date: Sun, 19 Jan 2020 19:47:26 +0200 Subject: [PATCH] new package: html2text Requested in https://github.com/termux/termux-packages/issues/4757. --- packages/html2text/build.sh | 7 + .../html2text-1.3.2a_Makefile.in.patch | 25 + .../html2text-1.3.2a_configure.patch | 147 ++++ .../patch-utf8-html2text-1.3.2a.patch | 706 ++++++++++++++++++ 4 files changed, 885 insertions(+) create mode 100644 packages/html2text/build.sh create mode 100644 packages/html2text/html2text-1.3.2a_Makefile.in.patch create mode 100644 packages/html2text/html2text-1.3.2a_configure.patch create mode 100644 packages/html2text/patch-utf8-html2text-1.3.2a.patch diff --git a/packages/html2text/build.sh b/packages/html2text/build.sh new file mode 100644 index 000000000..60fe86bf7 --- /dev/null +++ b/packages/html2text/build.sh @@ -0,0 +1,7 @@ +TERMUX_PKG_HOMEPAGE=http://www.mbayer.de/html2text/ +TERMUX_PKG_DESCRIPTION="Utility that converts HTML documents into plain text" +TERMUX_PKG_LICENSE="GPL-2.0" +TERMUX_PKG_VERSION=1.3.2 +TERMUX_PKG_SRCURL=http://www.mbayer.de/html2text/downloads/html2text-${TERMUX_PKG_VERSION}a.tar.gz +TERMUX_PKG_SHA256=000b39d5d910b867ff7e087177b470a1e26e2819920dcffd5991c33f6d480392 +TERMUX_PKG_BUILD_IN_SRC=true diff --git a/packages/html2text/html2text-1.3.2a_Makefile.in.patch b/packages/html2text/html2text-1.3.2a_Makefile.in.patch new file mode 100644 index 000000000..45d959050 --- /dev/null +++ b/packages/html2text/html2text-1.3.2a_Makefile.in.patch @@ -0,0 +1,25 @@ +diff -uNr html2text-1.3.2a/Makefile.in html2text-1.3.2a.mod/Makefile.in +--- html2text-1.3.2a/Makefile.in 2004-01-14 15:47:02.000000000 +0200 ++++ html2text-1.3.2a.mod/Makefile.in 2020-01-19 19:44:43.131479673 +0200 +@@ -29,9 +29,9 @@ + YFLAGS = + + INSTALLER = install +-BINDIR = /usr/local/bin +-MANDIR = /usr/local/man +-DOCDIR = /usr/share/doc/html2text ++BINDIR = @TERMUX_PREFIX@/bin ++MANDIR = @TERMUX_PREFIX@/share/man ++DOCDIR = @TERMUX_PREFIX@/share/doc/html2text + + CXX = @CXX@ + BOOL_DEFINITION = @BOOL_DEFINITION@ +@@ -91,7 +91,7 @@ + # This is mostly thought for RPM builts and users that don't read the documentation. + + install : +- $(INSTALLER) -s -m 755 html2text $(BINDIR); ++ $(INSTALLER) -m 755 html2text $(BINDIR); + $(INSTALLER) -m 644 html2text.1.gz $(MANDIR)/man1; + $(INSTALLER) -m 644 html2textrc.5.gz $(MANDIR)/man5; + $(INSTALLER) -d -m 755 $(DOCDIR); diff --git a/packages/html2text/html2text-1.3.2a_configure.patch b/packages/html2text/html2text-1.3.2a_configure.patch new file mode 100644 index 000000000..38f19dbc1 --- /dev/null +++ b/packages/html2text/html2text-1.3.2a_configure.patch @@ -0,0 +1,147 @@ +diff -uNr html2text-1.3.2a/configure html2text-1.3.2a.mod/configure +--- html2text-1.3.2a/configure 2004-01-12 17:47:18.000000000 +0200 ++++ html2text-1.3.2a.mod/configure 2020-01-19 19:43:44.205959803 +0200 +@@ -31,24 +31,6 @@ + # + + $echo 'Checking C++ compiler... \c'; +-cat <$tmp_file.C; +-#include +-int main(int, char **) { +- std::cout << "hello" << std::endl; +- return 0; +-} +-EOF +-CXX=unknown; +-for i in "CC" "g++" "cc" "$CC"; do +- if $i -c $tmp_file.C 2>/dev/null; then +- CXX="$i"; +- break; +- fi; +-done; +-if test "$CXX" = unknown; then +- $echo "Error: Could not find a working C++ compiler."; +- exit 1; +-fi; + $echo "use \"$CXX\""; + + # +@@ -57,7 +39,7 @@ + + $echo 'Checking ... \c'; + SYS_POLL_MISSING=unknown; +-cat <$tmp_file.C; ++cat <$tmp_file.cc; + #ifdef SYS_POLL_MISSING /* { */ + struct pollfd { int fd; short events; short revents; }; + extern "C" int poll(struct pollfd *ufds, unsigned int nfds, int timeout); +@@ -76,7 +58,7 @@ + } + EOF + for i in "" -DSYS_POLL_MISSING; do +- if $CXX $tmp_file.C $i -o $tmp_file 2>/dev/null; then ++ if $CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null; then + SYS_POLL_MISSING="$i"; + break; + fi; +@@ -97,7 +79,7 @@ + + $echo 'Checking for socket libraries... \c'; + SOCKET_LIBRARIES=unknown; +-cat >$tmp_file.C <$tmp_file.cc </dev/null; then ++ if $CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null; then + SOCKET_LIBRARIES="$i"; + break; + fi; +@@ -128,7 +110,7 @@ + + $echo 'Checking "bool"... \c'; + BOOL_DEFINITION=unknown; +-cat <$tmp_file.C; ++cat <$tmp_file.cc; + #ifdef BOOL_DEFINITION + BOOL_DEFINITION + #endif +@@ -144,7 +126,7 @@ + '-DBOOL_DEFINITION="typedef unsigned char bool;const bool false=0,true=1;"' \ + '-DBOOL_DEFINITION="enum bool{false,true};"'; \ + do +- if eval "$CXX $tmp_file.C $i -o $tmp_file 2>/dev/null"; then ++ if eval "$CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null"; then + BOOL_DEFINITION="$i"; + break; + fi; +@@ -165,7 +147,7 @@ + + $echo 'Checking "explicit"... \c'; + EXPLICIT=unknown; +-cat <$tmp_file.C; ++cat <$tmp_file.cc; + struct C { + explicit C(int) {} + }; +@@ -175,7 +157,7 @@ + '' \ + '-Dexplicit='; \ + do +- if eval "$CXX $tmp_file.C $i -o $tmp_file 2>/dev/null"; then ++ if eval "$CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null"; then + EXPLICIT="$i"; + break; + fi; +@@ -195,7 +177,7 @@ + # + + $echo 'Checking Standard C++ library... \c'; +-cat <$tmp_file.C; ++cat <$tmp_file.cc; + #include + #include + #include +@@ -207,7 +189,7 @@ + using namespace std; + void func() { map x; } + EOF +-if $CXX -c $tmp_file.C 2>/dev/null; then ++if $CXX -c $tmp_file.cc 2>/dev/null; then + LIBSTDCXX_INCLUDES=""; + LIBSTDCXX_LIBS=""; + $echo 'works; no need to make "./libstd"'; +@@ -223,7 +205,7 @@ + # + AUTO_PTR_BROKEN=""; + $echo 'Checking "auto_ptr"... \c'; +-cat <$tmp_file.C; ++cat <$tmp_file.cc; + #include + #include + #include +@@ -243,7 +225,7 @@ + return 0; + } + EOF +-if eval "$CXX -c $LIBSTDCXX_INCLUDES $EXPLICIT $BOOL_DEFINITION $tmp_file.C" 2>/dev/null; then ++if eval "$CXX -c $LIBSTDCXX_INCLUDES $EXPLICIT $BOOL_DEFINITION $tmp_file.cc" 2>/dev/null; then + $echo 'defined in , good'; + else + $echo 'not defined or not working, use "./libstd/include/auto_ptr.h"'; +@@ -255,8 +237,8 @@ + # + MAKEDEPEND_INCLUDES=""; + $echo 'Checking "makedepend" includes... \c'; +-echo "#include " >$tmp_file.C; +-MAKEDEPEND_INCLUDES=`$CXX -E $tmp_file.C 2>/dev/null | ++echo "#include " >$tmp_file.cc; ++MAKEDEPEND_INCLUDES=`$CXX -E $tmp_file.cc 2>/dev/null | + sed -n \ + -e 's/^#line .*"\(\/.*\)\/.*".*/-I\1/p' \ + -e 's/^# [1-9][0-9]* "\(\/.*\)\/.*".*/-I\1/p' | diff --git a/packages/html2text/patch-utf8-html2text-1.3.2a.patch b/packages/html2text/patch-utf8-html2text-1.3.2a.patch new file mode 100644 index 000000000..442d1871e --- /dev/null +++ b/packages/html2text/patch-utf8-html2text-1.3.2a.patch @@ -0,0 +1,706 @@ +diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C +--- html2text-1.3.2a/Area.C 2003-11-23 12:05:29.000000000 +0100 ++++ html2text-1.3.2a-patched/Area.C 2005-05-13 22:19:59.862137688 +0200 +@@ -36,10 +36,13 @@ + #include + + #include "Area.h" ++#include "html.h" + #include "string.h" + + #define LATIN1_nbsp 160 + ++extern int use_encoding; ++ + /* ------------------------------------------------------------------------- */ + + #define malloc_array(type, size)\ +@@ -81,6 +84,27 @@ + + /* ------------------------------------------------------------------------- */ + ++/* utf_length() and utf_width() ++ * ++ * Very simplified algorithm of calculating length of UTF-8 ++ * string. No check for errors. Counting only ASCII bytes and ++ * leading bytes of UTF-8 multibyte sequences. All bytes like ++ * 10xxxxxx are dropped. If USE_UTF8 is false then returns ++ * usual length. --YS ++ */ ++ ++unsigned int ++Line::utf_length(size_type f, size_type t) const ++{ ++ size_type m = (t < length_ ? t : length_); ++ size_type r = m - f; ++ if(USE_UTF8) { ++ for (int i = f; i < m; i++) ++ if((cells_[i].character & 0xc0) == 0x80) r--; ++ } ++ return r; ++} ++ + void + Line::resize(size_type l) + { +@@ -236,6 +260,23 @@ + return *this; + } + ++unsigned int ++Area::utf_width() ++{ ++ size_type r = width_; ++ if(USE_UTF8) { r = 0; ++ for (size_type yy = 0; yy < height_; yy++) { ++ size_type r1 = 0; ++ for (int i = width_ - 1; i >= 0; i--) { ++ if(!r1 && isspace(cells_[yy][i].character)) continue; ++ if((cells_[yy][i].character & 0xc0) != 0x80) r1++; ++ } ++ if(r < r1) r = r1; ++ } ++ } ++ return r; ++} ++ + void + Area::resize(size_type w, size_type h) + { +@@ -439,7 +480,7 @@ + char c = p->character; + char a = p->attribute; + +- if (c == (char) LATIN1_nbsp) c = ' '; ++ if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' '; + + if (a == Cell::NONE) { + os << c; +Nur in html2text-1.3.2a-patched/: Area.C.orig. +diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h +--- html2text-1.3.2a/Area.h 2003-11-23 12:05:29.000000000 +0100 ++++ html2text-1.3.2a-patched/Area.h 2005-05-13 22:19:59.863137536 +0200 +@@ -81,6 +81,8 @@ + Cell &operator[](size_type x) { return cells_[x]; } + const Cell *cells() const { return cells_; } + ++ unsigned int utf_length(size_type f, size_type t) const; ++ + void resize(size_type l); + void enlarge(size_type l) { if (l > length_) resize(l); } + +@@ -134,6 +136,8 @@ + Cell *operator[](size_type y) { return cells_[y]; } + const Area &operator>>=(size_type rs); + ++ unsigned int utf_width(); ++ + void resize(size_type w, size_type h); + void enlarge(size_type w, size_type h); + +Nur in html2text-1.3.2a-patched/: Area.h.orig. +diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C +--- html2text-1.3.2a/format.C 2003-11-23 12:05:29.000000000 +0100 ++++ html2text-1.3.2a-patched/format.C 2005-05-13 22:19:59.865137232 +0200 +@@ -1210,6 +1210,7 @@ + } + + Line::size_type to = from + 1; ++ int to_from; + + Line::size_type lbp = (Line::size_type) -1; // "Last break position". + +@@ -1238,18 +1239,20 @@ + to++; + } + +- if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; } ++ if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) ++ { to = lbp; break; } + } + ++ to_from = line.utf_length(from,to); + /* + * Copy the "from...to" range from the "line" to the bottom of the "res" + * Area. + */ + Area::size_type x = 0; + Area::size_type len = to - from; +- if (halign == Area::LEFT || len >= w) { ; } else +- if (halign == Area::CENTER) { x += (w - len) / 2; } else +- if (halign == Area::RIGHT) { x += w - len; } ++ if (halign == Area::LEFT || to_from >= w) { ; } else ++ if (halign == Area::CENTER) { x += (w - to_from) / 2; } else ++ if (halign == Area::RIGHT) { x += w - to_from; } + res->insert(line.cells() + from, len, x, res->height()); + + /* +Nur in html2text-1.3.2a-patched/: format.C.orig. +diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C +--- html2text-1.3.2a/html2text.C 2003-11-23 12:05:29.000000000 +0100 ++++ html2text-1.3.2a-patched/html2text.C 2005-05-13 22:19:59.868136776 +0200 +@@ -148,9 +148,10 @@ + -o Redirect output into \n\ + -nobs Do not use backspaces for boldface and underlining\n\ + -ascii Use plain ASCII for output instead of ISO-8859-1\n\ ++ -utf8 Assume both terminal and input stream are in UTF-8 mode\n\ + "; + +-int use_iso8859 = 1; ++int use_encoding = ISO8859; + + int + main(int argc, char **argv) +@@ -199,7 +200,8 @@ + if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else + if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else + if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else +- if (!strcmp(arg, "-ascii" )) { use_iso8859 = false; } else ++ if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else ++ if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else + { + std::cerr + << "Unrecognized command line option \"" +Nur in html2text-1.3.2a-patched/: html2text.C.orig. +diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h +--- html2text-1.3.2a/html.h 2001-10-04 22:03:54.000000000 +0200 ++++ html2text-1.3.2a-patched/html.h 2005-05-13 22:19:59.866137080 +0200 +@@ -61,6 +61,11 @@ + + /* ------------------------------------------------------------------------- */ + ++enum {ASCII, ISO8859, UTF8}; ++#define USE_ISO8859 (use_encoding == ISO8859) ++#define USE_ASCII (use_encoding == ASCII) ++#define USE_UTF8 (use_encoding == UTF8) ++ + #define LATIN1_nbsp 160 + #define LATIN1_iexcl 161 + #define LATIN1_cent 162 +diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C +--- html2text-1.3.2a/sgml.C 2003-11-23 12:09:11.000000000 +0100 ++++ html2text-1.3.2a-patched/sgml.C 2005-05-13 22:19:59.870136472 +0200 +@@ -62,261 +62,280 @@ + char name[8]; + int iso8859code; + char *asciistr; ++ unsigned long unicode; + } entities[] = { +- { "AElig", LATIN1_AElig, "AE" }, +- { "AMP", 0, "&" }, +- { "Aacute", LATIN1_Aacute, "A'" }, +- { "Acirc", LATIN1_Acirc, "A^" }, +- { "Agrave", LATIN1_Agrave, "A`" }, +- { "Alpha", 0, "A" }, +- { "Aring", LATIN1_Aring, "AA" }, +- { "Atilde", LATIN1_Atilde, "A~" }, +- { "Auml", LATIN1_Auml, "A\"" }, +- { "Beta", 0, "B" }, +- { "Ccedil", LATIN1_Ccedil, "C," }, +- { "Chi", 0, "H" }, +- { "Dagger", 0, "++" }, +- { "Delta", 0, "D" }, +- { "ETH", LATIN1_ETH, "D-" }, +- { "Eacute", LATIN1_Eacute, "E'" }, +- { "Ecirc", LATIN1_Ecirc, "E^" }, +- { "Egrave", LATIN1_Egrave, "E`" }, +- { "Epsilon", 0, "E" }, +- { "Eta", 0, "E" }, +- { "Euml", LATIN1_Euml, "E\"" }, +- { "GT", 0, ">" }, +- { "Gamma", 0, "G" }, +- { "Iacute", LATIN1_Iacute, "I'" }, +- { "Icirc", LATIN1_Icirc, "I^" }, +- { "Igrave", LATIN1_Igrave, "I`" }, +- { "Iota", 0, "I" }, +- { "Iuml", LATIN1_Iuml, "I\"" }, +- { "Kappa", 0, "K" }, +- { "LT", 0, "<" }, +- { "Lambda", 0, "L" }, +- { "Mu", 0, "M" }, +- { "Ntilde", LATIN1_Ntilde, "N~" }, +- { "Nu", 0, "N" }, +- { "OElig", 0, "OE" }, +- { "Oacute", LATIN1_Oacute, "O'" }, +- { "Ocirc", LATIN1_Ocirc, "O^" }, +- { "Ograve", LATIN1_Ograve, "O`" }, +- { "Omega", 0, "O" }, +- { "Omicron", 0, "O" }, +- { "Oslash", LATIN1_Oslash, "O/" }, +- { "Otilde", LATIN1_Otilde, "O~" }, +- { "Ouml", LATIN1_Ouml, "O\"" }, +- { "Phi", 0, "F" }, +- { "Pi", 0, "P" }, +- { "Prime", 0, "''" }, +- { "Psi", 0, "PS" }, +- { "QUOT", 0, "\"" }, +- { "Rho", 0, "R" }, +- { "Scaron", 0, "S" }, +- { "Sigma", 0, "S" }, +- { "THORN", LATIN1_THORN, "TH" }, +- { "Tau", 0, "T" }, +- { "Theta", 0, "TH" }, +- { "Uacute", LATIN1_Uacute, "U'" }, +- { "Ucirc", LATIN1_Ucirc, "U^" }, +- { "Ugrave", LATIN1_Ugrave, "U`" }, +- { "Upsilon", 0, "U" }, +- { "Uuml", LATIN1_Uuml, "U\"" }, +- { "Xi", 0, "X" }, +- { "Yacute", LATIN1_Yacute, "Y'" }, +- { "Yuml", 0, "Y\"" }, +- { "Zeta", 0, "Z" }, +- { "aacute", LATIN1_aacute, "a'" }, +- { "acirc", LATIN1_acirc, "a^" }, +- { "acute", LATIN1_acute, "'" }, +- { "aelig", LATIN1_aelig, "ae" }, +- { "agrave", LATIN1_agrave, "a`" }, ++ { "AElig", LATIN1_AElig, "AE", 0x00c6}, ++ { "AMP", 0, "&", 0x0026}, ++ { "Aacute", LATIN1_Aacute, "A'", 0x00c1}, ++ { "Acirc", LATIN1_Acirc, "A^", 0x00c2}, ++ { "Agrave", LATIN1_Agrave, "A`", 0x00c0}, ++ { "Alpha", 0, "A", 0x0391}, ++ { "Aring", LATIN1_Aring, "AA", 0x00c5}, ++ { "Atilde", LATIN1_Atilde, "A~", 0x00c3}, ++ { "Auml", LATIN1_Auml, "A\"", 0x00c4}, ++ { "Beta", 0, "B", 0x0392}, ++ { "Ccedil", LATIN1_Ccedil, "C,", 0x00c7}, ++ { "Chi", 0, "H", 0x03a7}, ++ { "Dagger", 0, "++", 0x2020}, ++ { "Delta", 0, "D", 0x0394}, ++ { "ETH", LATIN1_ETH, "D-", 0x00d0}, ++ { "Eacute", LATIN1_Eacute, "E'", 0x00c9}, ++ { "Ecirc", LATIN1_Ecirc, "E^", 0x00ca}, ++ { "Egrave", LATIN1_Egrave, "E`", 0x00c8}, ++ { "Epsilon", 0, "E", 0x0395}, ++ { "Eta", 0, "E", 0x0397}, ++ { "Euml", LATIN1_Euml, "E\"", 0x00cb}, ++ { "GT", 0, ">", 0x003e}, ++ { "Gamma", 0, "G", 0x0393}, ++ { "Iacute", LATIN1_Iacute, "I'", 0x00cd}, ++ { "Icirc", LATIN1_Icirc, "I^", 0x00ce}, ++ { "Igrave", LATIN1_Igrave, "I`", 0x00cc}, ++ { "Iota", 0, "I", 0x0399}, ++ { "Iuml", LATIN1_Iuml, "I\"", 0x00cf}, ++ { "Kappa", 0, "K", 0x039a}, ++ { "LT", 0, "<", 0x003c}, ++ { "Lambda", 0, "L", 0x039b}, ++ { "Mu", 0, "M", 0x039c}, ++ { "Ntilde", LATIN1_Ntilde, "N~", 0x00d1}, ++ { "Nu", 0, "N", 0x039d}, ++ { "OElig", 0, "OE", 0x0152}, ++ { "Oacute", LATIN1_Oacute, "O'", 0x00d3}, ++ { "Ocirc", LATIN1_Ocirc, "O^", 0x00d4}, ++ { "Ograve", LATIN1_Ograve, "O`", 0x00d2}, ++ { "Omega", 0, "O", 0x03a9}, ++ { "Omicron", 0, "O", 0x039f}, ++ { "Oslash", LATIN1_Oslash, "O/", 0x00d8}, ++ { "Otilde", LATIN1_Otilde, "O~", 0x00d5}, ++ { "Ouml", LATIN1_Ouml, "O\"", 0x00d6}, ++ { "Phi", 0, "F", 0x03a6}, ++ { "Pi", 0, "P", 0x03a0}, ++ { "Prime", 0, "''", }, ++ { "Psi", 0, "PS", 0x03a8}, ++ { "QUOT", 0, "\"", }, ++ { "Rho", 0, "R", 0x03a1}, ++ { "Scaron", 0, "S", 0x0161}, ++ { "Sigma", 0, "S", 0x03a3}, ++ { "THORN", LATIN1_THORN, "TH", 0x00de}, ++ { "Tau", 0, "T", 0x03a4}, ++ { "Theta", 0, "TH", 0x0398}, ++ { "Uacute", LATIN1_Uacute, "U'", 0x00da}, ++ { "Ucirc", LATIN1_Ucirc, "U^", 0x00db}, ++ { "Ugrave", LATIN1_Ugrave, "U`", 0x00d9}, ++ { "Upsilon", 0, "U", 0x03a5}, ++ { "Uuml", LATIN1_Uuml, "U\"", 0x00dc}, ++ { "Xi", 0, "X", 0x039e}, ++ { "Yacute", LATIN1_Yacute, "Y'", 0x00dd}, ++ { "Yuml", 0, "Y\"", 0x0178}, ++ { "Zeta", 0, "Z", 0x0396}, ++ { "aacute", LATIN1_aacute, "a'", 0x00e1}, ++ { "acirc", LATIN1_acirc, "a^", 0x00e2}, ++ { "acute", LATIN1_acute, "'", 0x00b4}, ++ { "aelig", LATIN1_aelig, "ae", 0x00e6}, ++ { "agrave", LATIN1_agrave, "a`", 0x00e0}, + { "alefsym", 0, "Aleph" }, +- { "alpha", 0, "a" }, ++ { "alpha", 0, "a", 0x03b1}, + { "amp", 0, "&" }, + { "and", 0, "AND" }, + { "ang", 0, "-V" }, + { "apos", 0, "'" }, +- { "aring", LATIN1_aring, "aa" }, +- { "asymp", 0, "~=" }, +- { "atilde", LATIN1_atilde, "a~" }, +- { "auml", LATIN1_auml, "a\"" }, ++ { "aring", LATIN1_aring, "aa", 0x00e5}, ++ { "asymp", 0, "~=", 0x2248}, ++ { "atilde", LATIN1_atilde, "a~", 0x00e3}, ++ { "auml", LATIN1_auml, "a\"", 0x00e5}, + { "bdquo", 0, "\"" }, +- { "beta", 0, "b" }, +- { "brvbar", LATIN1_brvbar, "|" }, +- { "bull", 0, " o " }, ++ { "beta", 0, "b", 0x03b2}, ++ { "brvbar", LATIN1_brvbar, "|", 0x00a6}, ++ { "bull", 0, " o ", 0x2022}, + { "cap", 0, "(U" }, +- { "ccedil", LATIN1_ccedil, "c," }, +- { "cedil", LATIN1_cedil, "," }, +- { "cent", LATIN1_cent, "-c-" }, +- { "chi", 0, "h" }, +- { "circ", 0, "^" }, ++ { "ccedil", LATIN1_ccedil, "c,", 0x00e7}, ++ { "cedil", LATIN1_cedil, ",", 0x00b8}, ++ { "cent", LATIN1_cent, "-c-", 0x00a2}, ++ { "chi", 0, "h", 0x03c7}, ++ { "circ", 0, "^", 0x005e}, + // { "clubs", 0, "[clubs]" }, + { "cong", 0, "?=" }, +- { "copy", LATIN1_copy, "(c)" }, ++ { "copy", LATIN1_copy, "(c)", 0x00a9}, + { "crarr", 0, "<-'" }, + { "cup", 0, ")U" }, +- { "curren", LATIN1_curren, "CUR" }, ++ { "curren", LATIN1_curren, "CUR", 0x00a4}, + { "dArr", 0, "vv" }, +- { "dagger", 0, "+" }, ++ { "dagger", 0, "+", 0x2020}, + { "darr", 0, "v" }, +- { "deg", LATIN1_deg, "DEG" }, +- { "delta", 0, "d" }, ++ { "deg", LATIN1_deg, "DEG", 0x00b0}, ++ { "delta", 0, "d", 0x03b4}, + // { "diams", 0, "[diamonds]" }, +- { "divide", LATIN1_divide, "/" }, +- { "eacute", LATIN1_eacute, "e'" }, +- { "ecirc", LATIN1_ecirc, "e^" }, +- { "egrave", LATIN1_egrave, "e`" }, ++ { "divide", LATIN1_divide, "/", 0x00f7}, ++ { "eacute", LATIN1_eacute, "e'", 0x00e9}, ++ { "ecirc", LATIN1_ecirc, "e^", 0x00ea}, ++ { "egrave", LATIN1_egrave, "e`", 0x00e8}, + { "empty", 0, "{}" }, +- { "epsilon", 0, "e" }, +- { "equiv", 0, "==" }, +- { "eta", 0, "e" }, +- { "eth", LATIN1_eth, "d-" }, +- { "euml", LATIN1_euml, "e\"" }, +- { "euro", 0, "EUR" }, ++ { "epsilon", 0, "e", 0x03b5}, ++ { "equiv", 0, "==", 0x2261}, ++ { "eta", 0, "e", 0x03b7}, ++ { "eth", LATIN1_eth, "d-", 0x00f0}, ++ { "euml", LATIN1_euml, "e\"", 0x00eb}, ++ { "euro", 0, "EUR", 0x20ac}, + { "exist", 0, "TE" }, + { "fnof", 0, "f" }, + { "forall", 0, "FA" }, +- { "frac12", LATIN1_frac12, " 1/2" }, +- { "frac14", LATIN1_frac14, " 1/4" }, +- { "frac34", LATIN1_frac34, " 3/4" }, ++ { "frac12", LATIN1_frac12, " 1/2",0x00bd}, ++ { "frac14", LATIN1_frac14, " 1/4",0x00bc}, ++ { "frac34", LATIN1_frac34, " 3/4",0x00be}, + { "frasl", 0, "/" }, +- { "gamma", 0, "g" }, +- { "ge", 0, ">=" }, +- { "gt", 0, ">" }, ++ { "gamma", 0, "g", 0x03b3}, ++ { "ge", 0, ">=", 0x2265}, ++ { "gt", 0, ">", 0x003e}, + { "hArr", 0, "<=>" }, + { "harr", 0, "<->" }, + // { "hearts", 0, "[hearts]" }, +- { "hellip", 0, "..." }, +- { "iacute", LATIN1_iacute, "i'" }, +- { "icirc", LATIN1_icirc, "i^" }, +- { "iexcl", LATIN1_iexcl, "!" }, +- { "igrave", LATIN1_igrave, "i`" }, ++ { "hellip", 0, "...", 0x2026}, ++ { "iacute", LATIN1_iacute, "i'", 0x00ed}, ++ { "icirc", LATIN1_icirc, "i^", 0x00ee}, ++ { "iexcl", LATIN1_iexcl, "!", 0x00a1}, ++ { "igrave", LATIN1_igrave, "i`", 0x00ec}, + { "image", 0, "Im" }, +- { "infin", 0, "oo" }, +- { "int", 0, "INT" }, +- { "iota", 0, "i" }, +- { "iquest", LATIN1_iquest, "?" }, ++ { "infin", 0, "oo", 0x221e}, ++ { "int", 0, "INT", 0x222b}, ++ { "iota", 0, "i", 0x03b9}, ++ { "iquest", LATIN1_iquest, "?", 0x00bf}, + { "isin", 0, "(-" }, +- { "iuml", LATIN1_iuml, "i\"" }, +- { "kappa", 0, "k" }, ++ { "iuml", LATIN1_iuml, "i\"", 0x00ef}, ++ { "kappa", 0, "k", 0x03ba}, + { "lArr", 0, "<=" }, +- { "lambda", 0, "l" }, ++ { "lambda", 0, "l", 0x03bb}, + { "lang", 0, "" }, + { "lsaquo", 0, "<" }, + { "lsquo", 0, "`" }, +- { "lt", 0, "<" }, +- { "macr", LATIN1_macr, "-" }, ++ { "lt", 0, "<", 0x003c}, ++ { "macr", LATIN1_macr, "-", 0x00af}, + { "mdash", 0, "--" }, +- { "micro", LATIN1_micro, "my" }, +- { "middot", LATIN1_middot, "." }, +- { "minus", 0, "-" }, +- { "mu", 0, "m" }, ++ { "micro", LATIN1_micro, "my", 0x00b5}, ++ { "middot", LATIN1_middot, ".", 0x00b7}, ++ { "minus", 0, "-", 0x2212}, ++ { "mu", 0, "m", 0x03bc}, + { "nabla", 0, "Nabla" }, +- { "nbsp", LATIN1_nbsp, " " }, ++ { "nbsp", LATIN1_nbsp, " ", 0x00a0}, + { "ndash", 0, "-" }, +- { "ne", 0, "!=" }, ++ { "ne", 0, "!=", 0x2260}, + { "ni", 0, "-)" }, + { "not", LATIN1_not, "NOT" }, + { "notin", 0, "!(-" }, + { "nsub", 0, "!(C" }, +- { "ntilde", LATIN1_ntilde, "n~" }, +- { "nu", 0, "n" }, +- { "oacute", LATIN1_oacute, "o'" }, +- { "ocirc", LATIN1_ocirc, "o^" }, ++ { "ntilde", LATIN1_ntilde, "n~", 0x00f1}, ++ { "nu", 0, "n", 0x03bd}, ++ { "oacute", LATIN1_oacute, "o'", 0x00f3}, ++ { "ocirc", LATIN1_ocirc, "o^", 0x00f4}, + { "oelig", 0, "oe" }, +- { "ograve", LATIN1_ograve, "o`" }, ++ { "ograve", LATIN1_ograve, "o`", 0x00f2}, + { "oline", LATIN1_macr, "-" }, +- { "omega", 0, "o" }, +- { "omicron", 0, "o" }, ++ { "omega", 0, "o", 0x03c9}, ++ { "omicron", 0, "o", 0x03bf}, + { "oplus", 0, "(+)" }, + { "or", 0, "OR" }, +- { "ordf", LATIN1_ordf, "-a" }, +- { "ordm", LATIN1_ordm, "-o" }, +- { "oslash", LATIN1_oslash, "o/" }, +- { "otilde", LATIN1_otilde, "o~" }, ++ { "ordf", LATIN1_ordf, "-a", 0x00aa}, ++ { "ordm", LATIN1_ordm, "-o", 0x00ba}, ++ { "oslash", LATIN1_oslash, "o/", 0x00f8}, ++ { "otilde", LATIN1_otilde, "o~", 0x00f5}, + { "otimes", 0, "(x)" }, +- { "ouml", LATIN1_ouml, "o\"" }, +- { "para", LATIN1_para, "P:" }, +- { "part", 0, "PART" }, +- { "permil", 0, " 0/00" }, ++ { "ouml", LATIN1_ouml, "o\"", 0x00f6}, ++ { "para", LATIN1_para, "P:", 0x00b6}, ++ { "part", 0, "PART",0x2202}, ++ { "permil", 0, " 0/00",0x2030}, + { "perp", 0, "-T" }, +- { "phi", 0, "f" }, +- { "pi", 0, "p" }, ++ { "phi", 0, "f", 0x03c6}, ++ { "pi", 0, "p", 0x03c0}, + { "piv", 0, "Pi" }, +- { "plusmn", LATIN1_plusmn, "+/-" }, +- { "pound", LATIN1_pound, "-L-" }, ++ { "plusmn", LATIN1_plusmn, "+/-", 0x00b1}, ++ { "pound", LATIN1_pound, "-L-", 0x00a3}, + { "prime", 0, "'" }, +- { "prod", 0, "PROD" }, ++ { "prod", 0, "PROD",0x220f}, + { "prop", 0, "0(" }, +- { "psi", 0, "ps" }, ++ { "psi", 0, "ps", 0x03c8}, + { "quot", 0, "\"" }, + { "rArr", 0, "=>" }, +- { "radic", 0, "SQRT" }, ++ { "radic", 0, "SQRT",0x221a}, + { "rang", 0, "/>" }, + { "raquo", LATIN1_raquo, ">>" }, +- { "rarr", 0, "->" }, ++ { "rarr", 0, "->", 0x2192}, + // { "rceil", 0, ">|" }, + { "rdquo", 0, "\"" }, + { "real", 0, "Re" }, +- { "reg", LATIN1_reg, "(R)" }, ++ { "reg", LATIN1_reg, "(R)", 0x00ae}, + // { "rfloor", 0, "|>" }, +- { "rho", 0, "r" }, ++ { "rho", 0, "r", 0x03c1}, + { "rsaquo", 0, ">" }, + { "rsquo", 0, "'" }, + { "sbquo", 0, "'" }, +- { "scaron", 0, "s" }, ++ { "scaron", 0, "s", 0x0161}, + { "sdot", 0, "DOT" }, +- { "sect", LATIN1_sect, "S:" }, ++ { "sect", LATIN1_sect, "S:", 0x00a7}, + { "shy", LATIN1_shy, "" }, +- { "sigma", 0, "s" }, +- { "sigmaf", 0, "s" }, ++ { "sigma", 0, "s", 0x03c3}, ++ { "sigmaf", 0, "s", 0x03c2}, + { "sim", 0, "~" }, + // { "spades", 0, "[spades]" }, + { "sub", 0, "(C" }, + { "sube", 0, "(_" }, +- { "sum", 0, "SUM" }, ++ { "sum", 0, "SUM", 0x2211}, + { "sup", 0, ")C" }, +- { "sup1", LATIN1_sup1, "^1" }, +- { "sup2", LATIN1_sup2, "^2" }, +- { "sup3", LATIN1_sup3, "^3" }, ++ { "sup1", LATIN1_sup1, "^1", 0x00b9}, ++ { "sup2", LATIN1_sup2, "^2", 0x00b2}, ++ { "sup3", LATIN1_sup3, "^3", 0x00b3}, + { "supe", 0, ")_" }, +- { "szlig", LATIN1_szlig, "ss" }, +- { "tau", 0, "t" }, ++ { "szlig", LATIN1_szlig, "ss", 0x00df}, ++ { "tau", 0, "t", 0x03c4}, + { "there4", 0, ".:" }, +- { "theta", 0, "th" }, +- { "thorn", LATIN1_thorn, "th" }, +- { "tilde", 0, "~" }, +- { "times", LATIN1_times, "x" }, +- { "trade", 0, "[TM]" }, ++ { "theta", 0, "th", 0x03b8}, ++ { "thorn", LATIN1_thorn, "th", 0x00fe}, ++ { "tilde", 0, "~", 0x02dc}, ++ { "times", LATIN1_times, "x", 0x00d7}, ++ { "trade", 0, "[TM]",0x2122}, + { "uArr", 0, "^^" }, +- { "uacute", LATIN1_uacute, "u'" }, ++ { "uacute", LATIN1_uacute, "u'", 0x00fa}, + { "uarr", 0, "^" }, +- { "ucirc", LATIN1_ucirc, "u^" }, +- { "ugrave", LATIN1_ugrave, "u`" }, +- { "uml", LATIN1_uml, "\"" }, +- { "upsilon", 0, "u" }, +- { "uuml", LATIN1_uuml, "u\"" }, ++ { "ucirc", LATIN1_ucirc, "u^", 0x00fb}, ++ { "ugrave", LATIN1_ugrave, "u`", 0x00f9}, ++ { "uml", LATIN1_uml, "\"", 0x00a8}, ++ { "upsilon", 0, "u", 0x03c5}, ++ { "uuml", LATIN1_uuml, "u\"", 0x00fc}, + { "weierp", 0, "P" }, +- { "xi", 0, "x" }, +- { "yacute", LATIN1_yacute, "y'" }, +- { "yen", LATIN1_yen, "YEN" }, +- { "yuml", LATIN1_yuml, "y\"" }, +- { "zeta", 0, "z" }, ++ { "xi", 0, "x", 0x03be}, ++ { "yacute", LATIN1_yacute, "y'", 0x00fd}, ++ { "yen", LATIN1_yen, "YEN", 0x00a5}, ++ { "yuml", LATIN1_yuml, "y\"", 0x00ff}, ++ { "zeta", 0, "z", 0x03b6}, + }; + +-extern int use_iso8859; ++extern int use_encoding; + + /* ------------------------------------------------------------------------- */ + ++char ubuf[4]; ++ ++char *mkutf(unsigned long x) ++{ ++ memset(ubuf, 0, 4); ++ if(x < 128) ubuf[0] = x; ++ else if(x < 0x800) { ++ ubuf[0] = (0xc0 | ((x >> 6) & 0x1f)); ++ ubuf[1] = (0x80 | (x & 0x3f)); ++ } ++ else { ++ ubuf[0] = (0xe0 | ((x >> 12) & 0x0f)); ++ ubuf[1] = (0x80 | ((x >> 6) & 0x3f)); ++ ubuf[2] = (0x80 | (x & 0x3f)); ++ } ++ return ubuf; ++} ++ + void + replace_sgml_entities(string *s) + { +@@ -330,9 +349,9 @@ + */ + while (j < l && s->at(j) != '&') ++j; + /* +- * We could convert high-bit chars to "é" here if use_iso8859 +- * is off, then let them be translated or not. Is the purpose of +- * !use_iso8859 to allow SGML entities to be seen, or to strongly ++ * We could convert high-bit chars to "é" here if USE_ASCII ++ * is on, then let them be translated or not. Is the purpose of ++ * USE_ASCII to allow SGML entities to be seen, or to strongly + * filter against high-ASCII chars that might blow up a terminal + * that doesn't speak ISO8859? For the moment, "allow SGML entities + * to be seen" -- no filtering here. +@@ -370,7 +389,11 @@ + if (!isdigit(c)) break; + x = 10 * x + c - '0'; + } +- if (use_iso8859 || (x < 128)) { ++ if (USE_UTF8) { ++ s->replace(beg, j - beg, mkutf(x)); ++ j = beg + 1; ++ } ++ else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) { + s->replace(beg, j - beg, 1, (char) x); + j = beg + 1; + } else { +@@ -408,13 +431,17 @@ + (int (*)(const void *, const void *)) strcmp + ); + if (entity != NULL) { +- if (use_iso8859 && entity->iso8859code) { ++ if (USE_ISO8859 && entity->iso8859code) { + s->replace(beg, j - beg, 1, (char) entity->iso8859code); + j = beg + 1; +- } else if (entity->asciistr) { ++ } else if (USE_ASCII && entity->asciistr) { + s->replace(beg, j - beg, entity->asciistr); + j = beg + 1; + } /* else don't replace it at all, we don't have a translation */ ++ else if(USE_UTF8 && entity->unicode) { ++ s->replace(beg, j - beg, mkutf(entity->unicode)); ++ j = beg + 1; ++ } + } + } else { + ; /* EXTENSION: Allow literal '&' sometimes. */ +diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C +--- html2text-1.3.2a/table.C 2002-07-22 13:32:50.000000000 +0200 ++++ html2text-1.3.2a-patched/table.C 2005-05-13 22:19:59.871136320 +0200 +@@ -175,7 +175,7 @@ + - (*number_of_columns_return - 1) * (column_spacing + 0), + Area::LEFT // Yields better results than "p->halign"! + )); +- p->width = tmp.get() ? tmp->width() : 0; ++ p->width = tmp.get() ? tmp->utf_width() : 0; + } + p->minimized = false; + +@@ -308,7 +308,7 @@ + left_of_column + old_column_width - 1, + Area::LEFT // Yields better results than "lc.halign"! + )); +- w = tmp->width(); ++ w = tmp->utf_width(); + if (w >= left_of_column + old_column_width) lc.minimized = true; + } + if (w > left_of_column + new_column_width) {