using MeCab;
using MeCab.Extension.UniDic;
using RomajiConverter.Core.Extensions;
using RomajiConverter.Core.Models;
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using RomajiConverter.Core.Options;
namespace RomajiConverter.Core.Helpers
{
public static class RomajiHelper
{
///
/// 分词器
///
private static MeCabTagger _tagger;
///
/// 自定义词典<原文, 假名>
///
private static Dictionary _customizeDict;
public static void Init(string baseDirectory = null)
{
string rootPath = !string.IsNullOrEmpty(baseDirectory)
? baseDirectory
: AppDomain.CurrentDomain.BaseDirectory;
//词典路径
var dicPath = Path.Combine(rootPath, "unidic");
var parameter = new MeCabParam
{
DicDir = dicPath,
LatticeLevel = MeCabLatticeLevel.Zero
};
_tagger = MeCabTagger.Create(parameter);
var str = File.ReadAllText(Path.Combine(rootPath, "customizeDict.txt"));
var list = str.Split(Environment.NewLine.ToArray());
_customizeDict = new Dictionary();
foreach (var item in list)
{
if (string.IsNullOrWhiteSpace(item)) continue;
var array = item.Split(' ');
if (array.Length < 2) continue;
if (!_customizeDict.ContainsKey(array[0]))
_customizeDict.Add(array[0], array[1]);
}
}
#region 主逻辑
///
/// 生成转换结果列表
///
///
///
///
public static IEnumerable ToRomaji(string text, ToRomajiOptions options = null)
{
options = options ?? new ToRomajiOptions();
var timeSpans = new List();
var lineTextList = text.Split(Environment.NewLine.ToArray()).Where(p => !string.IsNullOrWhiteSpace(p)).ToList();
for (var i = 0; i < lineTextList.Count; i++)
{
if (LrcParser.LrcLineRegex.IsMatch(lineTextList[i]))
{
var lyric = LrcParser.Parse(lineTextList[i]).FirstOrDefault();
timeSpans.Add(lyric.Time);
lineTextList[i] = lyric.Text;
}
else
{
timeSpans.Add(null);
}
}
ushort lineIndex = 0;
for (var index = 0; index < lineTextList.Count; index++)
{
var line = lineTextList[index];
if (IsChinese(line, options.ChineseRate)) continue;
var convertedLine = new ConvertedLine
{
Index = lineIndex,
Time = index < timeSpans.Count ? timeSpans[index] : null,
Japanese = line.Replace("\0", "")
};
foreach (var sentence in convertedLine.Japanese.LineToUnits())
{
if (IsEnglish(sentence))
{
convertedLine.Units.Add(new ConvertedUnit(lineIndex, sentence, sentence, sentence, false));
}
else
foreach (var unit in SentenceToRomaji(lineIndex, sentence, options.IsParticleAsPronunciation))
convertedLine.Units.Add(unit);
}
if (index + 1 < lineTextList.Count && IsChinese(lineTextList[index + 1], options.ChineseRate))
convertedLine.Chinese = lineTextList[index + 1];
lineIndex++;
yield return convertedLine;
}
}
///
/// 分句转为罗马音
///
///
///
///
///
public static IEnumerable SentenceToRomaji(ushort lineIndex, string str, bool isParticleAsPronunciation)
{
foreach (var item in _tagger.ParseToNodes(str))
{
var unit = MeCabNodeToUnit(lineIndex, item, isParticleAsPronunciation);
if (unit != null)
yield return unit;
}
}
public static ConvertedUnit MeCabNodeToUnit(ushort lineIndex, MeCabNode item, bool isParticleAsPronunciation)
{
ConvertedUnit unit = null;
if (item.CharType > 0)
{
var features = CustomSplit(item.Feature);
if (TryCustomConvert(item.Surface, out var customResult))
{
//用户自定义词典
unit = new ConvertedUnit(lineIndex,
item.Surface,
customResult,
KanaHelper.KatakanaToRomaji(customResult),
true);
}
else if (features.Length > 0 && (!isParticleAsPronunciation || item.GetPos1() != "助詞") && IsJapanese(item.Surface))
{
//纯假名
unit = new ConvertedUnit(lineIndex,
item.Surface,
KanaHelper.ToHiragana(item.Surface),
KanaHelper.KatakanaToRomaji(item.Surface),
false);
}
else if (features.Length <= 6 || new[] { "補助記号" }.Contains(item.GetPos1()))
{
//标点符号或无法识别的字
unit = new ConvertedUnit(lineIndex,
item.Surface,
item.Surface,
item.Surface,
false);
}
else if (IsEnglish(item.Surface))
{
//英文
unit = new ConvertedUnit(lineIndex,
item.Surface,
item.Surface,
item.Surface,
false);
}
else
{
//汉字或助词
var kana = GetKana(item);
unit = new ConvertedUnit(lineIndex,
item.Surface,
KanaHelper.ToHiragana(kana),
KanaHelper.KatakanaToRomaji(kana),
!IsJapanese(item.Surface));
var (replaceHiragana, replaceRomaji) = GetReplaceData(item);
unit.ReplaceHiragana = replaceHiragana;
unit.ReplaceRomaji = replaceRomaji;
}
}
else if (item.Stat != MeCabNodeStat.Bos && item.Stat != MeCabNodeStat.Eos)
{
unit = new ConvertedUnit(lineIndex,
item.Surface,
item.Surface,
item.Surface,
false);
}
return unit;
}
#endregion
#region 帮助方法
///
/// 自定义分隔方法(Feature可能存在如 a,b,c,"d,e",f 格式的数据,此处不能把双引号中的内容也分隔开)
///
///
///
private static string[] CustomSplit(string str)
{
var list = new List();
var item = new List();
var haveMark = false;
foreach (var c in str)
if (c == ',' && !haveMark)
{
list.Add(new string(item.ToArray()));
item.Clear();
}
else if (c == '"')
{
item.Add(c);
haveMark = !haveMark;
}
else
{
item.Add(c);
}
return list.ToArray();
}
///
/// 获取所有发音
///
///
///
private static (ObservableCollection replaceHiragana, ObservableCollection
replaceRomaji) GetReplaceData(MeCabNode node)
{
var length = node.Length;
var replaceNodeList = new List();
GetAllReplaceNode(replaceNodeList, node);
void GetAllReplaceNode(List list, MeCabNode n)
{
if (n != null && !list.Contains(n) && n.Length == length)
{
list.Add(n);
GetAllReplaceNode(list, n.BNext);
GetAllReplaceNode(list, n.ENext);
}
}
var replaceHiragana = new ObservableCollection();
var replaceRomaji = new ObservableCollection();
ushort i = 1;
foreach (var meCabNode in replaceNodeList
.GroupBy(GetKana)
.Select(g => g.First()))
{
var kana = GetKana(meCabNode);
if (kana != null)
{
replaceHiragana.Add(new ReplaceString(i, KanaHelper.ToHiragana(kana), true));
replaceRomaji.Add(new ReplaceString(i, KanaHelper.KatakanaToRomaji(kana), true));
i++;
}
}
return (replaceHiragana, replaceRomaji);
}
private static string GetKana(MeCabNode node)
{
return node.GetPos1() == "助詞" ? node.GetPron() : node.GetKana();
}
///
/// 自定义转换规则
///
///
///
///
private static bool TryCustomConvert(string str, out string result)
{
if (_customizeDict.ContainsKey(str))
{
result = _customizeDict[str];
return true;
}
result = "";
return false;
}
///
/// 判断字符串(句子)是否简体中文
///
///
/// 容错率(0-1)
///
public static bool IsChinese(string str, float rate)
{
if (str.Length < 2)
return false;
var wordArray = str.ToCharArray();
var total = wordArray.Length;
var chCount = 0f;
var enCount = 0f;
foreach (var word in wordArray)
{
if (word != 'ー' && IsJapanese(word.ToString()))
//含有日文直接返回否
return false;
var gbBytes = Encoding.Unicode.GetBytes(word.ToString());
if (gbBytes.Length == 2) // double bytes char.
{
if (gbBytes[1] >= 0x4E && gbBytes[1] <= 0x9F) //中文
chCount++;
else
total--;
}
else if (gbBytes.Length == 1)
{
var byteAscii = int.Parse(gbBytes[0].ToString());
if ((byteAscii >= 65 && byteAscii <= 90) || (byteAscii >= 97 && byteAscii <= 122)) //英文字母
enCount++;
else
total--;
}
}
if (chCount == 0) return false; //一个简体中文都没有
return (chCount + enCount) / total >= rate;
}
///
/// 判断字符串是否全为单字节
///
///
///
public static bool IsEnglish(string str)
{
return new Regex("^[\x20-\x7E]+$", RegexOptions.Compiled).IsMatch(str);
}
///
/// 判断字符串是否全为假名
///
///
///
private static bool IsJapanese(string str)
{
return Regex.IsMatch(str, @"^[\u3040-\u30ff]+$", RegexOptions.Compiled);
}
#endregion
}
}