The bytes are encoded as big-endian UTF-16 (called "Big Endian Unicode" in .NET):
// Decode the raw bytes with .NET's BigEndianUnicode encoding.
return Encoding.BigEndianUnicode.GetString(bytes);
Tested with your string and with a test case from http://www.steptools.com/stds/step/IS_final_p21e3.html
Note that the whole IFC format is quite complex. It would take me at least 2-4 hours to write a full decoder supporting the various `\S\(something)`, `\P(something)\`, `\X2\(hex)`, `\X4\(hex)`, `\X\(hex)` escape sequences (plus the ending `\X0\`). There is even a problem in the documentation about the `\X4\` examples (they are given with 7 hex digits instead of 8 hex digits), and it seems that the whole file should be UTF-8 encoded outside the escape sequences.
Aaaaand done:
Some tests:
// With .NET Core/.NET 5.0 you'll need the nuget
// https://www.nuget.org/packages/System.Text.Encoding.CodePages/
// And this line
//Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
// Nothing is needed with .NET Framework

// All inputs are verbatim strings (@"..."), so every backslash below is a
// literal backslash that belongs to the escape sequence being decoded.

// Unquoted inputs (the surrounding apostrophes are not part of the text).
string strExampleUnquoted = ItfStringDecoder.DecodeItfString(@"\X2\0420043504310440043E\X0\");
string str1Unquoted = ItfStringDecoder.DecodeItfString(@"CAT");
string str2Unquoted = ItfStringDecoder.DecodeItfString(@"Don''t");
string str3Unquoted = ItfStringDecoder.DecodeItfString(@"''");
string str4Unquoted = ItfStringDecoder.DecodeItfString(@"");
string str5Unquoted = ItfStringDecoder.DecodeItfString(@"\S\Drger");
string str6Unquoted = ItfStringDecoder.DecodeItfString(@"h\S\tel");
string str7Unquoted = ItfStringDecoder.DecodeItfString(@"\PE\\S\*\S\U\S\b");
string str8Unquoted = ItfStringDecoder.DecodeItfString(@"\X2\03C0\X0\");
string str9Unquoted = ItfStringDecoder.DecodeItfString(@"\X2\03B103B203B3\X0\");
string str10Unquoted = ItfStringDecoder.DecodeItfString(@"\X4\0001F600\X0\");
string str11Unquoted = ItfStringDecoder.DecodeItfString(@"\X4\0001F6000001F638\X0\");
string str12Unquoted = ItfStringDecoder.DecodeItfString(@"see \X\A7 4.1");
string str13Unquoted = ItfStringDecoder.DecodeItfString(@"line one\X\0Aline two");

// The same inputs wrapped in apostrophes, decoded with quoted = true.
string str1Quoted = ItfStringDecoder.DecodeItfString(@"'CAT'", true);
string str2Quoted = ItfStringDecoder.DecodeItfString(@"'Don''t'", true);
string str3Quoted = ItfStringDecoder.DecodeItfString(@"''''", true);
string str4Quoted = ItfStringDecoder.DecodeItfString(@"''", true);
string str5Quoted = ItfStringDecoder.DecodeItfString(@"'\S\Drger'", true);
string str6Quoted = ItfStringDecoder.DecodeItfString(@"'h\S\tel'", true);
string str7Quoted = ItfStringDecoder.DecodeItfString(@"'\PE\\S\*\S\U\S\b'", true);
string str8Quoted = ItfStringDecoder.DecodeItfString(@"'\X2\03C0\X0\'", true);
string str9Quoted = ItfStringDecoder.DecodeItfString(@"'\X2\03B103B203B3\X0\'", true);
string str10Quoted = ItfStringDecoder.DecodeItfString(@"'\X4\0001F600\X0\'", true);
string str11Quoted = ItfStringDecoder.DecodeItfString(@"'\X4\0001F6000001F638\X0\'", true);
string str12Quoted = ItfStringDecoder.DecodeItfString(@"'see \X\A7 4.1'", true);
string str13Quoted = ItfStringDecoder.DecodeItfString(@"'line one\X\0Aline two'", true);
And the decoder:
/// <summary>
/// Decodes an encoded string supplied as raw bytes. The bytes are read as
/// UTF-8 text and the result is handed to the string overload.
/// </summary>
/// <param name="bytes">The raw bytes of the encoded string.</param>
/// <param name="quoted">true = 'XYZ', false = XYZ</param>
/// <returns>The decoded string.</returns>
public static string DecodeItfString(byte[] bytes, bool quoted = false)
{
    string text = Encoding.UTF8.GetString(bytes);
    return DecodeItfString(text, quoted);
}
/// <summary>
/// Decodes an encoded string: collapses doubled apostrophes and doubled
/// backslashes, and dispatches the \S\, \P.\ and \X...\ escape sequences to
/// their dedicated helpers.
/// </summary>
/// <param name="str">The encoded text.</param>
/// <param name="quoted">true = 'XYZ', false = XYZ</param>
/// <returns>The decoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
/// <exception cref="FormatException">
/// The text is malformed: missing surrounding apostrophes when
/// <paramref name="quoted"/> is true, an apostrophe that is not doubled, or a
/// backslash that does not start a recognised escape sequence.
/// </exception>
public static string DecodeItfString(string str, bool quoted = false)
{
    if (str == null)
    {
        throw new ArgumentNullException(nameof(str));
    }

    // We start with iso-8859-1 that is null
    Encoding encoding = null;

    // [start, end] is the inclusive range of characters to decode.
    int start = 0;
    int end = str.Length - 1;

    if (quoted)
    {
        if (str.Length == 0 || str[0] != '\'')
        {
            throw new FormatException("Malformed string, non starting with \"'\"");
        }

        // Length < 2 rejects a lone "'" that would otherwise be accepted as
        // both the opening and the closing apostrophe.
        if (str.Length < 2 || str[str.Length - 1] != '\'')
        {
            throw new FormatException("Malformed string, non ending with \"'\"");
        }

        start = 1;
        end = str.Length - 2;
    }

    var sb = new StringBuilder();

    for (int i = start; i <= end; i++)
    {
        char ch0 = str[i];

        if (ch0 == '\'')
        {
            // An apostrophe inside the text must be doubled.
            if (i + 1 > end || str[i + 1] != '\'')
            {
                throw new FormatException($"Malformed string, \"'\" not followed by \"'\" at position {i}");
            }

            sb.Append('\'');
            i++;
        }
        else if (ch0 == '\\')
        {
            if (i + 1 > end)
            {
                throw new FormatException($"Malformed string, \"\\\" not followed by legal character at position {i}");
            }

            char ch1 = str[i + 1];

            // Each helper returns how many extra characters it consumed after
            // str[i]; the loop's own i++ then steps past the last one.
            switch (ch1)
            {
                case '\\':
                    sb.Append('\\');
                    i++;
                    break;
                case 'S':
                    i += DecodeItfStringPage(str, i, end, sb, encoding);
                    break;
                case 'P':
                    i += DecodeItfStringAlphabet(str, i, end, out encoding);
                    break;
                case 'X':
                    i += DecodeItfStringExtendedOrArbitary(str, i, end, sb);
                    break;
                default:
                    throw new FormatException($"Malformed string, \"\\\" followed by illegal character at position {i}");
            }
        }
        else
        {
            sb.Append(ch0);
        }
    }

    return sb.ToString();
}
/// <summary>
/// Decodes one \S\c escape sequence: the character c is mapped to the byte
/// (c + 128), which is then converted with the currently selected encoding
/// and appended to <paramref name="sb"/>.
/// </summary>
/// <param name="str">The text being decoded.</param>
/// <param name="i">Index of the backslash that starts the \S\ sequence.</param>
/// <param name="end">Index of the last character that may be read (inclusive).</param>
/// <param name="sb">Receives the decoded character.</param>
/// <param name="encoding">The active single-byte encoding; null means iso-8859-1.</param>
/// <returns>
/// 3: the number of characters consumed after <c>str[i]</c>. The caller adds
/// it to its index.
/// </returns>
/// <exception cref="FormatException">
/// The sequence is truncated, the second backslash is missing, or the
/// character after it is outside the printable ASCII range.
/// </exception>
private static int DecodeItfStringPage(string str, int i, int end, StringBuilder sb, Encoding encoding)
{
    if (i + 3 > end || str[i + 2] != '\\')
    {
        throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
    }

    char ch3 = str[i + 3];

    // Legal characters are space, digits, letters, '_' and the special
    // characters ! " * $ % & . # + , - ( ) ? / : ; < = > @ [ ] { | } ^ ` ~ \ '
    // Together these are exactly the printable ASCII range 0x20..0x7E.
    if (ch3 < ' ' || ch3 > '~')
    {
        throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
    }

    // Little cheat for iso-8859-1
    if (encoding == null)
    {
        // The iso-8859-1 encoding maps 1:1 with the first 256 unicode codepoints
        sb.Append((char)(ch3 + 128));
    }
    else
    {
        // Without array allocation (this is allocated on the stack)
        ReadOnlySpan<byte> bytes = stackalloc byte[] { (byte)(ch3 + 128) };
        sb.Append(encoding.GetString(bytes));
    }

    return 3;
}
/// <summary>
/// Decodes one \Px\ escape sequence, where x is a letter from A to I, and
/// selects the matching iso-8859 encoding for later use.
/// </summary>
/// <param name="str">The text being decoded.</param>
/// <param name="i">Index of the backslash that starts the \P sequence.</param>
/// <param name="end">Index of the last character that may be read (inclusive).</param>
/// <param name="encoding">
/// Receives the selected encoding: null for A (iso-8859-1), otherwise the
/// encoding for code page 28591 + (x - 'A').
/// </param>
/// <returns>
/// 3: the number of characters consumed after <c>str[i]</c>. The caller adds
/// it to its index.
/// </returns>
/// <exception cref="FormatException">
/// The sequence is truncated, the closing backslash is missing, or the
/// letter is not in the range A to I.
/// </exception>
private static int DecodeItfStringAlphabet(string str, int i, int end, out Encoding encoding)
{
    if (i + 3 > end || str[i + 3] != '\\')
    {
        throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
    }

    char ch2 = str[i + 2];

    if (ch2 < 'A' || ch2 > 'I')
    {
        throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
    }

    int ix = ch2 - 'A';

    // We don't need an encoder for iso-8859-1
    // and 28591 is iso-8859-1, 28592 is iso-8859-2...
    encoding = ix == 0 ? null : Encoding.GetEncoding(28591 + ix);

    return 3;
}
private static int DecodeItfStringExtendedOrArbitary(string str, int i, int end, StringBuilder sb)
{
if (i + 4 > end)
{
throw new FormatException($"Malformed string, "\X" not followed by legal character at position {i}");
}
char ch2 = str[i + 2];
if (ch2 == '\')
{
byte b1, b2;
if (!TryFromHex(str[i + 3], out b1) || !TryFromHex(str[i + 4], out b2))
{
throw new FormatException($"Malformed string, "\X\" not followed by legal character at position {i}");
}
byte b = (byte)(b1 * 16 + b2);
sb.Append((char)b);
return 4;
}
if (ch2 == '2')
{
if (str[i + 3] != '\')
{
throw new FormatException($"Malformed string, "\X2" not followed by legal character at position {i}");
}
int j = i + 4;
while (true)
{
if (j + 3 > end)
{
throw new FormatException($"Malformed string, "\X2" not followed by legal sequence of characters at position {j}");
}
byte b1, b2, b3, b4;
if (!TryFromHex(str[j], out b1) || !TryFromHex(str[j + 1], out b2) ||
!TryFromHex(str[j + 2], out b3) || !TryFromHex(str[j + 3], out b4))
{
throw new FormatException($"Malformed string, "\X2\" not followed by legal character at position {j}");
}
char ch = (char)(b1 << 12 | b2 << 8 | b3 << 4 | b4);
sb.Append(ch);
j += 4;
if (j + 3 > end)
{
throw new FormatException($"Malformed string, "\X2" not followed by legal sequence of characters at position {j}");
}
if (str[j] == '\')
{
if (str[j + 1] == 'X' && str[j + 2] == '0' && str[j + 3] == '\')
{
j += 3;
return j - i;
}
throw new FormatException($"Malformed string, "\X2" not followed by legal sequence of characters at position {j}");
}
}
}
if