Japanese docs HTML->TXT forbidden wordwraps; sync MAGIC-API

2023-07-17 23:52:23 -07:00 · 2023-07-17 23:52:23 -07:00 · 25a423f164
commit 25a423f164
parent bf1b90e279
21 changed files with 349 additions and 211 deletions
--- a/docs/nobr_forbidden.php
+++ b/docs/nobr_forbidden.php
@ -0,0 +1,112 @@
+#!/usr/bin/php
+<?php
+/* nobr_forbidden.php
+
+   A script to encase characters that are forbidden from
+   appearing at the beginning of a line (e.g., the
+   "。" full-stop), along with the previous character, inside
+   a "<nobr>...</nobr>", to prevent `w3m`'s word-wrapping
+   routine from doing that.
+
+   Bill Kendrick
+   2023-07-17 - 2023-07-17
+*/
+
+/* See https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages */
+
+/* Characters that must not begin a line.  Each group below is one
+   string; they are joined into a single string, in this order. */
+$forbidden_start = implode("", [
+  /* Closing brackets (ignoring ' " ]) */
+  ")｝〕〉》」』】〙〗〟｠»",
+
+  /* Japanese characters: chiisai (small) kana and special marks */
+  "ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻",
+
+  /* Hyphens */
+  "‐゠–〜",
+
+  /* Delimiters */
+  "？!‼⁇⁈⁉",
+
+  /* Mid-sentence punctuation */
+  "・、:;,",
+
+  /* Sentence-ending punctuation.  "\." is not a PHP escape sequence,
+     so the string holds a backslash followed by a dot. */
+  "。\.",
+]);
+
+/* Characters that must not end a line:
+   opening brackets (ignoring ' " [) */
+$forbidden_end = "(｛〔〈《「『【〘〖〝｟«";
+
+
+/* FIXME: Would be better to use DOMDocument() and modify the
+   text in the nodeValues, but the tuxpaint-docs HTML is
+   not currently XHTML compliant ;-( -bjk 2023.07.17
+
+   Something like this:
+
+     $dom = new DOMDocument();
+     libxml_use_internal_errors(false);
+     $dom->loadHTMLFile("php://stdin");
+     
+     $p = $dom->getElementsByTagName('p');
+     foreach ($p as $pnode) {
+       $nodeValue = $pnode->nodeValue;
+     
+       $nodeValue = preg_replace("/(.。)/", "<nobr>\\1<\/nobr>", $nodeValue);
+       $newNode = $dom->createElement("p", $nodeValue);
+       $pnode->parentNode->replaceChild($newNode, $pnode);
+     }
+     
+     echo $dom->saveHTML();
+
+   Instead, just reading the HTML file as a big text stream and
+   doing our best to only modify things that are not within the
+   HTML tags (esp. the <img> tags' "alt" attributes (aka "alt tags")).
+*/
+
+//setlocale(LC_ALL, "ja_JP.UTF-8");
+
+/* Filter standard input to standard output: text found outside of
+   HTML tags is passed through replace_forbidden(); everything between
+   "<" and ">" (tag names, attributes such as "alt") is copied as-is. */
+
+$fi = fopen("php://stdin", "r");
+
+if ($fi === false) {
+  /* Without a valid stream there is nothing to read. */
+  fwrite(STDERR, "nobr_forbidden.php: cannot open standard input\n");
+  exit(1);
+}
+
+/* Kept across lines, since a tag may span more than one line. */
+$in_tag = false;
+
+/* Loop on fgets()'s own return value rather than feof(): a final line
+   with no trailing newline is returned by fgets() while feof() already
+   reports end-of-file, so testing feof() afterwards would discard it. */
+while (($line = fgets($fi)) !== false) {
+  $newLine = "";
+  $text = "";
+  $len = strlen($line);
+
+  /* Byte-wise scan.  "<" and ">" are single ASCII bytes; assuming the
+     input is UTF-8, they cannot occur inside a multi-byte character. */
+  for ($i = 0; $i < $len; $i++) {
+    $c = $line[$i];
+
+    if ($c === "<") {
+      /* A tag starts: flush the text collected so far. */
+      $in_tag = true;
+      $newLine .= replace_forbidden($text) . $c;
+      $text = "";
+    } else if ($in_tag) {
+      /* Inside a tag: copy untouched; ">" closes the tag. */
+      if ($c === ">") {
+        $in_tag = false;
+      }
+      $newLine .= $c;
+    } else {
+      /* Ordinary text.  A stray ">" outside of any tag lands here too,
+         so the text collected before it is kept rather than dropped. */
+      $text .= $c;
+    }
+  }
+
+  /* Flush whatever text remains at the end of the line. */
+  $newLine .= replace_forbidden($text);
+
+  echo $newLine;
+}
+
+fclose($fi);
+
+/* replace_forbidden($str)
+
+   Wraps each place where a line break is forbidden inside
+   "<nobr>...</nobr>".
+
+   A wrapped unit is built around one Japanese character (kana or
+   kanji) and takes one of two shapes:
+
+     * one or more characters from $forbidden_end (which may not end
+       a line), the Japanese character, then any run of characters
+       from $forbidden_start (which may not begin a line);
+     * the Japanese character followed by one or more characters
+       from $forbidden_start.
+
+   Requiring a Japanese character in each unit means text without
+   kana or kanji, such as "word.", never matches.
+
+   Reads the globals $forbidden_start and $forbidden_end; both are
+   interpolated into regex character classes.
+
+   $str: text taken from outside of any HTML tag.
+   Returns: the text with the wrappers added, or $str unchanged if
+   the regex engine reports a failure.
+*/
+function replace_forbidden($str) {
+  global $forbidden_start, $forbidden_end;
+
+  /* Kanji (\p{Han}) is accepted next to kana, so that e.g. a
+     full-stop directly after a kanji is protected as well. */
+  $japanese = "\p{Katakana}\p{Hiragana}\p{Han}";
+
+  /* A single pass with "+" / "*" quantifiers keeps whole runs together
+     (e.g. a full-stop followed by a closing bracket) and lets an
+     opening bracket, its character and the trailing punctuation form
+     one unit.  Two separate passes cannot do that: the markup added by
+     the first pass would sit between the bracket and the character. */
+  $pattern = "/([$forbidden_end]+[$japanese][$forbidden_start]*|[$japanese][$forbidden_start]+)/u";
+
+  $result = preg_replace($pattern, "<nobr>\\1</nobr>", $str);
+
+  /* preg_replace() returns null on failure (for instance, invalid
+     UTF-8 with the "u" modifier); hand back the input rather than
+     letting the text vanish from the output. */
+  if ($result === null) {
+    return $str;
+  }
+
+  return $result;
+}
+