概要
- Shift_JIS 範囲内の各文字が、.NET の正規表現で \p{Is...} として指定できる Unicode の名前付きブロック (文字クラス) のどれに属するのかを調べ、該当するブロック名をリストアップする。
ソース
- ListupShiftJISClass.zip
- Program.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace ListupShiftJISClass
{
    /// <summary>
    /// Console tool that walks every UTF-16 code unit (U+0000..U+FFFF), keeps the
    /// ones that survive a Shift_JIS round trip, and prints the name of each
    /// Unicode named block in which at least one such character was found.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Block names accepted by .NET Framework regular expressions as
        /// <c>\p{Is&lt;name&gt;}</c>.
        /// http://msdn.microsoft.com/ja-jp/library/20bw873z
        /// </summary>
        /// <remarks>
        /// The order matters: <see cref="GetClassName"/> reports the first entry
        /// that matches. NOTE(review): a few entries look like alternate names for
        /// the same block (e.g. "Greek" / "GreekandCoptic", "PrivateUse" /
        /// "PrivateUseArea"); for those only the earlier name can ever be reported
        /// - confirm against the linked documentation.
        /// </remarks>
        static string[] ClassNames =
        {
            "BasicLatin",
            "Latin-1Supplement",
            "LatinExtended-A",
            "LatinExtended-B",
            "IPAExtensions",
            "SpacingModifierLetters",
            "CombiningDiacriticalMarks",
            "Greek",
            "GreekandCoptic",
            "Cyrillic",
            "CyrillicSupplement",
            "Armenian",
            "Hebrew",
            "Arabic",
            "Syriac",
            "Thaana",
            "Devanagari",
            "Bengali",
            "Gurmukhi",
            "Gujarati",
            "Oriya",
            "Tamil",
            "Telugu",
            "Kannada",
            "Malayalam",
            "Sinhala",
            "Thai",
            "Lao",
            "Tibetan",
            "Myanmar",
            "Georgian",
            "HangulJamo",
            "Ethiopic",
            "Cherokee",
            "UnifiedCanadianAboriginalSyllabics",
            "Ogham",
            "Runic",
            "Tagalog",
            "Hanunoo",
            "Buhid",
            "Tagbanwa",
            "Khmer",
            "Mongolian",
            "Limbu",
            "TaiLe",
            "KhmerSymbols",
            "PhoneticExtensions",
            "LatinExtendedAdditional",
            "GreekExtended",
            "GeneralPunctuation",
            "SuperscriptsandSubscripts",
            "CurrencySymbols",
            "CombiningDiacriticalMarksforSymbols",
            "CombiningMarksforSymbols",
            "LetterlikeSymbols",
            "NumberForms",
            "Arrows",
            "MathematicalOperators",
            "MiscellaneousTechnical",
            "ControlPictures",
            "OpticalCharacterRecognition",
            "EnclosedAlphanumerics",
            "BoxDrawing",
            "BlockElements",
            "GeometricShapes",
            "MiscellaneousSymbols",
            "Dingbats",
            "MiscellaneousMathematicalSymbols-A",
            "SupplementalArrows-A",
            "BraillePatterns",
            "SupplementalArrows-B",
            "MiscellaneousMathematicalSymbols-B",
            "SupplementalMathematicalOperators",
            "MiscellaneousSymbolsandArrows",
            "CJKRadicalsSupplement",
            "KangxiRadicals",
            "IdeographicDescriptionCharacters",
            "CJKSymbolsandPunctuation",
            "Hiragana",
            "Katakana",
            "Bopomofo",
            "HangulCompatibilityJamo",
            "Kanbun",
            "BopomofoExtended",
            "KatakanaPhoneticExtensions",
            "EnclosedCJKLettersandMonths",
            "CJKCompatibility",
            "CJKUnifiedIdeographsExtensionA",
            "YijingHexagramSymbols",
            "CJKUnifiedIdeographs",
            "YiSyllables",
            "YiRadicals",
            "HangulSyllables",
            "HighSurrogates",
            "HighPrivateUseSurrogates",
            "LowSurrogates",
            "PrivateUse",
            "PrivateUseArea",
            "CJKCompatibilityIdeographs",
            "AlphabeticPresentationForms",
            "ArabicPresentationForms-A",
            "VariationSelectors",
            "CombiningHalfMarks",
            "CJKCompatibilityForms",
            "SmallFormVariants",
            "ArabicPresentationForms-B",
            "HalfwidthandFullwidthForms",
            "Specials",
        };

        /// <summary>
        /// One regular expression per entry of <see cref="ClassNames"/>, keyed by
        /// the block name. Filled at the start of <see cref="Main"/>.
        /// </summary>
        static Dictionary<string, Regex> regCharClasses = new Dictionary<string, Regex>();

        /// <summary>
        /// Lazily resolved Shift_JIS encoding. Resolved on first use (not in a
        /// static initializer) so that a missing code page still surfaces as the
        /// exception thrown by <see cref="Encoding.GetEncoding(string)"/> itself.
        /// </summary>
        static Encoding shiftJisEncoding;

        /// <summary>
        /// Program entry point: builds the block regexes, classifies every
        /// Shift_JIS-representable character, and writes each block name that
        /// was hit to standard output, one per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            foreach (string classname in ClassNames)
            {
                regCharClasses[classname] = new Regex(@"\p{Is" + classname + "}");
            }

            // Characters collected per block name. StringBuilder avoids the
            // quadratic cost of repeated string concatenation for large blocks.
            Dictionary<string, StringBuilder> result = new Dictionary<string, StringBuilder>();

            // Block names in the order they were first encountered. Dictionary
            // enumeration order is not guaranteed, so the output order is tracked
            // explicitly; this keeps the listing in ascending code-point order.
            List<string> foundOrder = new List<string>();

            for (int code = 0x00; code <= 0xffff; ++code)
            {
                string c = ((char)code).ToString();
                if (!IsShiftJIS(c))
                {
                    continue;
                }

                string classname = GetClassName(c);
                StringBuilder chars;
                if (!result.TryGetValue(classname, out chars))
                {
                    chars = new StringBuilder();
                    result[classname] = chars;
                    foundOrder.Add(classname);
                }
                chars.Append(c);
            }

            foreach (string classname in foundOrder)
            {
                Console.WriteLine("{0}", classname);
                // Verbose variant that also dumps the characters of each block:
                //Console.WriteLine("{0}\n{1}\n", classname, result[classname]);
            }
        }

        /// <summary>
        /// Checks whether the given string consists only of characters within the
        /// Shift_JIS range by encoding it to Shift_JIS, decoding it back, and
        /// comparing the result with the input.
        /// </summary>
        /// <remarks>
        /// http://acha-ya.cocolog-nifty.com/blog/2010/12/unicode-ef79.html
        /// </remarks>
        /// <param name="checkString">The string to check.</param>
        /// <returns>
        /// <list type="bullet">
        /// <item>true: every character is within the Shift_JIS range</item>
        /// <item>false: the string contains a character outside the Shift_JIS range</item>
        /// </list>
        /// </returns>
        static bool IsShiftJIS(string checkString)
        {
            if (shiftJisEncoding == null)
            {
                shiftJisEncoding = Encoding.GetEncoding("shift_jis");
            }

            byte[] translateBuffer = shiftJisEncoding.GetBytes(checkString);
            string translateString = shiftJisEncoding.GetString(translateBuffer);
            return checkString == translateString;
        }

        /// <summary>
        /// Returns the name of the first block in <see cref="ClassNames"/> whose
        /// regular expression matches the given text.
        /// </summary>
        /// <param name="c">The text to classify; <see cref="Main"/> passes a single character.</param>
        /// <returns>The matching block name, or "-" when no block matches.</returns>
        static string GetClassName(string c)
        {
            foreach (string classname in ClassNames)
            {
                if (regCharClasses[classname].IsMatch(c))
                {
                    return classname;
                }
            }

            return "-"; // Not found
        }
    }
}
出力
BasicLatin
Latin-1Supplement
Greek
Cyrillic
GeneralPunctuation
LetterlikeSymbols
NumberForms
Arrows
MathematicalOperators
MiscellaneousTechnical
EnclosedAlphanumerics
BoxDrawing
GeometricShapes
MiscellaneousSymbols
CJKSymbolsandPunctuation
Hiragana
Katakana
EnclosedCJKLettersandMonths
CJKCompatibility
CJKUnifiedIdeographs
PrivateUse
CJKCompatibilityIdeographs
HalfwidthandFullwidthForms