Saturday, October 11, 2025

Reversing an Odia Unicode String

Odia is a language with a lot of combining characters. Therefore, a simple string reversal like str.split('').reverse().join('') will not work. With this naive approach, reversing a string like ଜଳାର୍ଣ୍ଣବ ends up as ବଣ୍ଣ୍ରାଳଜ, whereas the expected reversal is ବର୍ଣ୍ଣଳାଜ.

I had no success with Java's BreakIterator using the Odia locale, and packages like esrever failed as well. Finally, I tried a basic decomposition of Unicode Odia syllables. A string like ଜଳାର୍ଣ୍ଣବ is composed of the characters [ 'ଜ', 'ଳ', 'ା', 'ର', '୍', 'ଣ', '୍', 'ଣ', 'ବ' ], so handling the matras and falas did the job. Here is the code in JavaScript:

/*
 * Task: Find the length (in syllables) of an Odia word and print its syllable-wise reversal. Ex. length of ଦୁର୍ଯ୍ୟୋଧନ should be 4
 * Author: Dr. Swarupananda Bissoyi (swarupananda@gmail.com)
 * Original Version written on: October 7, 2022
 * Bug fixes and final version on: October 12, 2025
 */
/**
 * Counts the syllables in an Odia string and logs the string reversed
 * syllable by syllable.
 *
 * A syllable starts with a base character and absorbs what is attached to it:
 * up to two following maatraas, or a chain of characters linked by the
 * virama (U+0B4D), optionally written as Zero Width Joiner (U+200D) + virama.
 *
 * Side effect: prints `Reverse of [<text>] is [<reversed>]` via console.log.
 *
 * @param {string} text - The Odia text to measure.
 * @returns {number} The number of syllables found in `text`.
 */
function strlen(text) {
    const VIRAMA = '\u0B4D';
    const ZWJ = '\u200D'; // Zero Width Joiner
    const ZWNJ = '\u200C'; // Zero Width Non-Joiner
    const maatraas = 'ାିୀୁୂୃେୈୋୌଂଁଃ';

    let len = 0;
    let syllable = '';
    let reverseStr = '';

    for (let index = 0; index < text.length; index++) {
        // '\0' is used as an "end of text" sentinel for the two look-ahead slots
        const nextChar = index < text.length - 1 ? text[index + 1] : '\0';
        const nextToNextChar = index < text.length - 2 ? text[index + 2] : '\0';

        // Must be a local binding: assigning to an undeclared name is a
        // ReferenceError in strict mode / ES modules and a leaked global otherwise.
        let isEnd = true;

        // Append current character to syllable
        syllable += text[index];

        if (maatraas.includes(nextChar)) {
            // The next character is a maatraa: it belongs to this syllable
            syllable += nextChar;
            index++;
            // Special case: multiple maatraas, e.g., କିଁଆ
            if (maatraas.includes(nextToNextChar)) {
                syllable += nextToNextChar;
                index++;
            }
        } else if (nextChar === ZWJ && nextToNextChar === VIRAMA) {
            // Zero Width Joiner followed by virama keeps the syllable open
            // Case of ର‍୍ୟାକେଟ୍‌
            syllable += nextChar + nextToNextChar;
            index += 2;
            isEnd = false;
        } else if (nextChar === VIRAMA) {
            // Consonant conjunction: the virama links to the following character
            syllable += nextChar;
            index++;
            if (nextToNextChar === '\0') {
                // Virama is the last character, e.g., ସ୍ବରୁପ୍. Add a Zero Width
                // Non-Joiner so the virama cannot fuse with the syllable that
                // follows it in the reversed string.
                syllable += ZWNJ;
            } else {
                isEnd = false;
            }
        }

        if (isEnd) {
            reverseStr = syllable + reverseStr;
            syllable = '';
            len++;
        }
    }

    // Text that ends while a syllable is still open (e.g. a trailing ZWJ + virama)
    // must not be dropped from the count or from the reversed string.
    if (syllable !== '') {
        if (syllable.endsWith(VIRAMA)) {
            syllable += ZWNJ; // same protection as the trailing-virama case above
        }
        reverseStr = syllable + reverseStr;
        len++;
    }

    console.log(`Reverse of [${text}] is [${reverseStr}]`);
    return len;
}

// Example usage
// Sample Odia words passed to strlen, one call per entry
const strs = [
    "ଓଁ", "ଉହୁଁ", "କିଁଆ", "ସ୍ୱରୁପ", "ଅପ୍", "ବ୍ଲିଂ",
    "ସ୍ବରୁପ୍",
    "ଦୁଃଖ", "ର୍ୟାକେଟ୍‌", "ର‍୍ୟାକେଟ୍‌",
    "ସଂସ୍କୃଃ", "ଲେନ୍ସ", "ଲେନ୍‌ସ୍‌",
    "ପୁନର୍ଜ୍ଜୀବିତ",
    "କ୍ଲୀଂହ୍ଲୀ", "ରୁକ୍ମିଣୀ", "ଯାଚ୍ଞ।", "କେଁ କଟର୍", "ହଠାତ୍‌", "ନ୍ଦୁନ୍ଦୁଭି",
    "ଭର୍ତ୍ତୃହରି", "ପରାଙ୍‌ମୁଖ", "ସଂସ୍ଥାପକ", "ଦୁର୍ଯ୍ୟୋଧନ", "ଅର୍ଜ୍ଜୁନ",
    "ଜଳାର୍ଣ୍ଣବ", "କଟକ", "ଅସତ୍‌କର୍ମ", "ନୃପସ୍ଥାୟକ",
    "ଘୂର୍ଣ୍ଣିବାତ୍ୟା"
];

// Measure each word first (strlen prints the reversal), then print its length
strs.forEach((word) => {
    const syllableCount = strlen(word);
    console.log(`Length of ${word} : ${syllableCount}`);
});

No comments:

Post a Comment