Japanese docs HTML->TXT forbidden wordwraps; sync MAGIC-API

This commit is contained in:
Bill Kendrick 2023-07-17 23:52:23 -07:00
parent bf1b90e279
commit 25a423f164
21 changed files with 349 additions and 211 deletions

112
docs/nobr_forbidden.php Executable file
View file

@ -0,0 +1,112 @@
#!/usr/bin/php
<?php
/* nobr_forbidden.php
A script to encase characters that are forbidden from
appearing at the beginning of a line (e.g., the
"" full-stop), along with the previous character, inside
a "<nobr>...</nobr>", to prevent `w3m`'s word-wrapping
routine from doing that.
Bill Kendrick
2023-07-17 - 2023-07-17
*/
/* See https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages */
/* Closing brackets (ignoring ' " ]) */
$forbidden_start = ")}〕〉》」』】〙〗〟⦆»";
/* Japanese characters: chiisai kana and special marks */
$forbidden_start .= "ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻";
/* Hyphens */
$forbidden_start .= "‐゠–〜";
/* Delimiters */
$forbidden_start .= "!‼⁇⁈⁉";
/* Mid-sentence punctuation */
$forbidden_start .= "・、:;,";
/* Sentence-ending punctuation */
$forbidden_start .= "\.";
/* Opening brackets (ignoring ' " [) */
$forbidden_end = "({〔〈《「『【〘〖〝⦅«";
/* FIXME: Would be better to use DOMDocument() and modify the
the text in the nodeValues, but the tuxpaint-docs HTML is
not currently XHTML compliant ;-( -bjk 2023.07.17
Something like this:
$dom = new DOMDocument();
libxml_use_internal_errors(false);
$dom->loadHTMLFile("php://stdin");
$p = $dom->getElementsByTagName('p');
foreach ($p as $pnode) {
$nodeValue = $pnode->nodeValue;
$nodeValue = preg_replace("/(.。)/", "<nobr>\\1<\/nobr>", $nodeValue);
$newNode = $dom->createElement("p", $nodeValue);
$pnode->parentNode->replaceChild($newNode, $pnode);
}
echo $dom->saveHTML();
Instead, just reading the HTML file as a big text stream and
doing our best to only modify things that are not within the
HTML tags (esp. the <img> tags' "alt" attributes (aka "alt tags")).
*/
//setlocale(LC_ALL, "ja_JP.UTF-8");
$fi = fopen("php://stdin", "r");
$in_tag = false;
while (!feof($fi)) {
$line = fgets($fi);
if (!feof($fi)) {
$newLine = "";
$text = "";
for ($i = 0; $i < strlen($line); $i++) {
$c = substr($line, $i, 1);
if ($c == "<") {
$in_tag = true;
$newLine .= replace_forbidden($text) . $c;
$text = "";
} else if ($c == ">") {
$in_tag = false;
$newLine .= $c;
$text = "";
} else if ($in_tag) {
$newLine .= $c;
} else {
$text .= $c;
}
}
$newLine .= replace_forbidden($text);
$text = "";
echo $newLine;
}
}
function replace_forbidden($str) {
global $forbidden_start, $forbidden_end;
$japanese = "\p{Katakana}\p{Hiragana}";
$str = preg_replace("/([$japanese][$forbidden_start])/u", "<nobr>\\1</nobr>", $str);
$str = preg_replace("/([$forbidden_end][$japanese])/u", "<nobr>\\1</nobr>", $str);
return $str;
}