Japanese docs HTML->TXT forbidden wordwraps; sync MAGIC-API
This commit is contained in:
parent
bf1b90e279
commit
25a423f164
21 changed files with 349 additions and 211 deletions
112
docs/nobr_forbidden.php
Executable file
112
docs/nobr_forbidden.php
Executable file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/php
|
||||
<?php
|
||||
/* nobr_forbidden.php
|
||||
|
||||
A script to encase characters that are forbidden from
|
||||
appearing at the beginning of a line (e.g., the
|
||||
"。" full-stop), along with the previous character, inside
|
||||
a "<nobr>...</nobr>", to prevent `w3m`'s word-wrapping
|
||||
routine from doing that.
|
||||
|
||||
Bill Kendrick
|
||||
2023-07-17 - 2023-07-17
|
||||
*/
|
||||
|
||||
/* See https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages */
|
||||
|
||||
/* Characters that must never begin a line.  The groups below are joined
   into one string that is later dropped, as-is, into a regular-expression
   character class ("[...]"), so it must not contain an unescaped "]". */
$forbidden_start = implode("", array(
  /* Closing brackets (ignoring ' " ]) */
  ")}〕〉》」』】〙〗〟⦆»",

  /* Japanese characters: chiisai kana and special marks */
  "ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻",

  /* Hyphens */
  "‐゠–〜",

  /* Delimiters */
  "?!‼⁇⁈⁉",

  /* Mid-sentence punctuation */
  "・、:;,",

  /* Sentence-ending punctuation.  PHP keeps the backslash in "\." verbatim,
     so inside the character class it reads as an escaped, literal dot. */
  "。\.",
));

/* Characters that must never end a line:
   opening brackets (ignoring ' " [) */
$forbidden_end = "({〔〈《「『【〘〖〝⦅«";
|
||||
|
||||
|
||||
/* FIXME: Would be better to use DOMDocument() and modify the
text in the nodeValues, but the tuxpaint-docs HTML is
|
||||
not currently XHTML compliant ;-( -bjk 2023.07.17
|
||||
|
||||
Something like this:
|
||||
|
||||
$dom = new DOMDocument();
|
||||
libxml_use_internal_errors(false);
|
||||
$dom->loadHTMLFile("php://stdin");
|
||||
|
||||
$p = $dom->getElementsByTagName('p');
|
||||
foreach ($p as $pnode) {
|
||||
$nodeValue = $pnode->nodeValue;
|
||||
|
||||
$nodeValue = preg_replace("/(.。)/", "<nobr>\\1<\/nobr>", $nodeValue);
|
||||
$newNode = $dom->createElement("p", $nodeValue);
|
||||
$pnode->parentNode->replaceChild($newNode, $pnode);
|
||||
}
|
||||
|
||||
echo $dom->saveHTML();
|
||||
|
||||
Instead, just reading the HTML file as a big text stream and
|
||||
doing our best to only modify things that are not within the
|
||||
HTML tags (esp. the <img> tags' "alt" attributes (aka "alt tags")).
|
||||
*/
|
||||
|
||||
//setlocale(LC_ALL, "ja_JP.UTF-8");
|
||||
|
||||
/* Filter HTML from stdin to stdout, one line at a time.

   Each line is split into tag bytes (everything from "<" through ">"),
   which are copied through untouched, and runs of text between tags,
   which are passed through replace_forbidden() before being written.

   $in_tag is deliberately kept across lines so that a tag which spans
   several lines (e.g. a long <img ... alt="..."> tag) is still skipped. */
$fi = fopen("php://stdin", "r");

if ($fi === false) {
  fwrite(STDERR, "nobr_forbidden.php: unable to open stdin\n");
  exit(1);
}

$in_tag = false;

/* Loop on fgets()'s own return value rather than on feof(): testing feof()
   after the read discards a final line that has no trailing newline. */
while (($line = fgets($fi)) !== false) {
  $newLine = "";
  $text = "";
  $len = strlen($line);

  /* Byte-wise scanning is safe here provided the input is UTF-8: the bytes
     for "<" and ">" never occur inside a multi-byte UTF-8 sequence, so text
     runs are only ever cut on character boundaries. */
  for ($i = 0; $i < $len; $i++) {
    $c = $line[$i];

    if ($c == "<") {
      /* A tag starts: flush the text gathered so far. */
      $in_tag = true;
      $newLine .= replace_forbidden($text) . $c;
      $text = "";
    } else if ($c == ">") {
      /* A tag ends.  $text is always empty while inside a tag, so this
         flush only matters for a stray ">" in plain text, whose preceding
         text would otherwise be thrown away. */
      $in_tag = false;
      $newLine .= replace_forbidden($text) . $c;
      $text = "";
    } else if ($in_tag) {
      /* Tag contents (names, attributes) are copied verbatim. */
      $newLine .= $c;
    } else {
      $text .= $c;
    }
  }

  /* Flush whatever text trails the last tag on this line. */
  $newLine .= replace_forbidden($text);

  echo $newLine;
}
|
||||
|
||||
/* Wrap character pairs that must not be separated by a line break in
   "<nobr>...</nobr>".

   Two kinds of pairs are wrapped, in this order:
     1. a Katakana/Hiragana character followed by a character from the
        global $forbidden_start (must not begin a line);
     2. a character from the global $forbidden_end (must not end a line)
        followed by a Katakana/Hiragana character.

   Both globals are inserted verbatim into regular-expression character
   classes, so they must already be safe to use inside "[...]".

   $str     Plain text (no HTML tags), expected to be UTF-8.
   Returns  The text with the <nobr> wrappers added.  If a pattern fails
            to run (preg_replace() returns null, e.g. on malformed UTF-8),
            that pass is skipped and the text is left as it was, rather
            than being replaced by null and lost from the output. */
function replace_forbidden($str) {
  global $forbidden_start, $forbidden_end;

  $japanese = "\p{Katakana}\p{Hiragana}";

  $patterns = array(
    "/([$japanese][$forbidden_start])/u",
    "/([$forbidden_end][$japanese])/u",
  );

  foreach ($patterns as $pattern) {
    $replaced = preg_replace($pattern, "<nobr>\\1</nobr>", $str);

    /* null signals a PCRE error; keep the unmodified text in that case. */
    if ($replaced !== null) {
      $str = $replaced;
    }
  }

  return $str;
}
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue