GTMNSString+HTML.m 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. //
  2. // GTMNSString+HTML.m
  3. // Dealing with NSStrings that contain HTML
  4. //
  5. // Copyright 2006-2008 Google Inc.
  6. //
  7. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8. // use this file except in compliance with the License. You may obtain a copy
  9. // of the License at
  10. //
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  15. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  16. // License for the specific language governing permissions and limitations under
  17. // the License.
  18. //
  19. #import "GTMDefines.h"
  20. #import "GTMNSString+HTML.h"
  // One row of an escape table: pairs a single UTF-16 code unit with the
  // complete entity text ("&name;") that stands for it.
  typedef struct HTMLEscapeMap {
    NSString *escapeSequence;  // Entity text, including the leading '&' and trailing ';'.
    unichar uchar;             // Code unit the entity represents; the tables are sorted on this.
  } HTMLEscapeMap;
  // Named character entities for producing pure-ASCII HTML, and for decoding
  // named entities when unescaping.
  // Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
  //
  // Each escapeSequence is the full entity text ("&name;"); uchar is the code
  // unit it stands for. Rows MUST stay ordered by uchar, lowest to highest,
  // because the escaping code looks characters up with bsearch().
  static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
    // A.2.2. Special characters
    { @"&quot;", 34 },
    { @"&amp;", 38 },
    { @"&apos;", 39 },
    { @"&lt;", 60 },
    { @"&gt;", 62 },

    // A.2.1. Latin-1 characters
    { @"&nbsp;", 160 },
    { @"&iexcl;", 161 },
    { @"&cent;", 162 },
    { @"&pound;", 163 },
    { @"&curren;", 164 },
    { @"&yen;", 165 },
    { @"&brvbar;", 166 },
    { @"&sect;", 167 },
    { @"&uml;", 168 },
    { @"&copy;", 169 },
    { @"&ordf;", 170 },
    { @"&laquo;", 171 },
    { @"&not;", 172 },
    { @"&shy;", 173 },
    { @"&reg;", 174 },
    { @"&macr;", 175 },
    { @"&deg;", 176 },
    { @"&plusmn;", 177 },
    { @"&sup2;", 178 },
    { @"&sup3;", 179 },
    { @"&acute;", 180 },
    { @"&micro;", 181 },
    { @"&para;", 182 },
    { @"&middot;", 183 },
    { @"&cedil;", 184 },
    { @"&sup1;", 185 },
    { @"&ordm;", 186 },
    { @"&raquo;", 187 },
    { @"&frac14;", 188 },
    { @"&frac12;", 189 },
    { @"&frac34;", 190 },
    { @"&iquest;", 191 },
    { @"&Agrave;", 192 },
    { @"&Aacute;", 193 },
    { @"&Acirc;", 194 },
    { @"&Atilde;", 195 },
    { @"&Auml;", 196 },
    { @"&Aring;", 197 },
    { @"&AElig;", 198 },
    { @"&Ccedil;", 199 },
    { @"&Egrave;", 200 },
    { @"&Eacute;", 201 },
    { @"&Ecirc;", 202 },
    { @"&Euml;", 203 },
    { @"&Igrave;", 204 },
    { @"&Iacute;", 205 },
    { @"&Icirc;", 206 },
    { @"&Iuml;", 207 },
    { @"&ETH;", 208 },
    { @"&Ntilde;", 209 },
    { @"&Ograve;", 210 },
    { @"&Oacute;", 211 },
    { @"&Ocirc;", 212 },
    { @"&Otilde;", 213 },
    { @"&Ouml;", 214 },
    { @"&times;", 215 },
    { @"&Oslash;", 216 },
    { @"&Ugrave;", 217 },
    { @"&Uacute;", 218 },
    { @"&Ucirc;", 219 },
    { @"&Uuml;", 220 },
    { @"&Yacute;", 221 },
    { @"&THORN;", 222 },
    { @"&szlig;", 223 },
    { @"&agrave;", 224 },
    { @"&aacute;", 225 },
    { @"&acirc;", 226 },
    { @"&atilde;", 227 },
    { @"&auml;", 228 },
    { @"&aring;", 229 },
    { @"&aelig;", 230 },
    { @"&ccedil;", 231 },
    { @"&egrave;", 232 },
    { @"&eacute;", 233 },
    { @"&ecirc;", 234 },
    { @"&euml;", 235 },
    { @"&igrave;", 236 },
    { @"&iacute;", 237 },
    { @"&icirc;", 238 },
    { @"&iuml;", 239 },
    { @"&eth;", 240 },
    { @"&ntilde;", 241 },
    { @"&ograve;", 242 },
    { @"&oacute;", 243 },
    { @"&ocirc;", 244 },
    { @"&otilde;", 245 },
    { @"&ouml;", 246 },
    { @"&divide;", 247 },
    { @"&oslash;", 248 },
    { @"&ugrave;", 249 },
    { @"&uacute;", 250 },
    { @"&ucirc;", 251 },
    { @"&uuml;", 252 },
    { @"&yacute;", 253 },
    { @"&thorn;", 254 },
    { @"&yuml;", 255 },

    // A.2.2. Special characters cont'd
    { @"&OElig;", 338 },
    { @"&oelig;", 339 },
    { @"&Scaron;", 352 },
    { @"&scaron;", 353 },
    { @"&Yuml;", 376 },

    // A.2.3. Symbols
    { @"&fnof;", 402 },

    // A.2.2. Special characters cont'd
    { @"&circ;", 710 },
    { @"&tilde;", 732 },

    // A.2.3. Symbols cont'd
    { @"&Alpha;", 913 },
    { @"&Beta;", 914 },
    { @"&Gamma;", 915 },
    { @"&Delta;", 916 },
    { @"&Epsilon;", 917 },
    { @"&Zeta;", 918 },
    { @"&Eta;", 919 },
    { @"&Theta;", 920 },
    { @"&Iota;", 921 },
    { @"&Kappa;", 922 },
    { @"&Lambda;", 923 },
    { @"&Mu;", 924 },
    { @"&Nu;", 925 },
    { @"&Xi;", 926 },
    { @"&Omicron;", 927 },
    { @"&Pi;", 928 },
    { @"&Rho;", 929 },
    { @"&Sigma;", 931 },
    { @"&Tau;", 932 },
    { @"&Upsilon;", 933 },
    { @"&Phi;", 934 },
    { @"&Chi;", 935 },
    { @"&Psi;", 936 },
    { @"&Omega;", 937 },
    { @"&alpha;", 945 },
    { @"&beta;", 946 },
    { @"&gamma;", 947 },
    { @"&delta;", 948 },
    { @"&epsilon;", 949 },
    { @"&zeta;", 950 },
    { @"&eta;", 951 },
    { @"&theta;", 952 },
    { @"&iota;", 953 },
    { @"&kappa;", 954 },
    { @"&lambda;", 955 },
    { @"&mu;", 956 },
    { @"&nu;", 957 },
    { @"&xi;", 958 },
    { @"&omicron;", 959 },
    { @"&pi;", 960 },
    { @"&rho;", 961 },
    { @"&sigmaf;", 962 },
    { @"&sigma;", 963 },
    { @"&tau;", 964 },
    { @"&upsilon;", 965 },
    { @"&phi;", 966 },
    { @"&chi;", 967 },
    { @"&psi;", 968 },
    { @"&omega;", 969 },
    { @"&thetasym;", 977 },
    { @"&upsih;", 978 },
    { @"&piv;", 982 },

    // A.2.2. Special characters cont'd
    { @"&ensp;", 8194 },
    { @"&emsp;", 8195 },
    { @"&thinsp;", 8201 },
    { @"&zwnj;", 8204 },
    { @"&zwj;", 8205 },
    { @"&lrm;", 8206 },
    { @"&rlm;", 8207 },
    { @"&ndash;", 8211 },
    { @"&mdash;", 8212 },
    { @"&lsquo;", 8216 },
    { @"&rsquo;", 8217 },
    { @"&sbquo;", 8218 },
    { @"&ldquo;", 8220 },
    { @"&rdquo;", 8221 },
    { @"&bdquo;", 8222 },
    { @"&dagger;", 8224 },
    { @"&Dagger;", 8225 },

    // A.2.3. Symbols cont'd
    { @"&bull;", 8226 },
    { @"&hellip;", 8230 },

    // A.2.2. Special characters cont'd
    { @"&permil;", 8240 },

    // A.2.3. Symbols cont'd
    { @"&prime;", 8242 },
    { @"&Prime;", 8243 },

    // A.2.2. Special characters cont'd
    { @"&lsaquo;", 8249 },
    { @"&rsaquo;", 8250 },

    // A.2.3. Symbols cont'd
    { @"&oline;", 8254 },
    { @"&frasl;", 8260 },

    // A.2.2. Special characters cont'd
    { @"&euro;", 8364 },

    // A.2.3. Symbols cont'd
    { @"&image;", 8465 },
    { @"&weierp;", 8472 },
    { @"&real;", 8476 },
    { @"&trade;", 8482 },
    { @"&alefsym;", 8501 },
    { @"&larr;", 8592 },
    { @"&uarr;", 8593 },
    { @"&rarr;", 8594 },
    { @"&darr;", 8595 },
    { @"&harr;", 8596 },
    { @"&crarr;", 8629 },
    { @"&lArr;", 8656 },
    { @"&uArr;", 8657 },
    { @"&rArr;", 8658 },
    { @"&dArr;", 8659 },
    { @"&hArr;", 8660 },
    { @"&forall;", 8704 },
    { @"&part;", 8706 },
    { @"&exist;", 8707 },
    { @"&empty;", 8709 },
    { @"&nabla;", 8711 },
    { @"&isin;", 8712 },
    { @"&notin;", 8713 },
    { @"&ni;", 8715 },
    { @"&prod;", 8719 },
    { @"&sum;", 8721 },
    { @"&minus;", 8722 },
    { @"&lowast;", 8727 },
    { @"&radic;", 8730 },
    { @"&prop;", 8733 },
    { @"&infin;", 8734 },
    { @"&ang;", 8736 },
    { @"&and;", 8743 },
    { @"&or;", 8744 },
    { @"&cap;", 8745 },
    { @"&cup;", 8746 },
    { @"&int;", 8747 },
    { @"&there4;", 8756 },
    { @"&sim;", 8764 },
    { @"&cong;", 8773 },
    { @"&asymp;", 8776 },
    { @"&ne;", 8800 },
    { @"&equiv;", 8801 },
    { @"&le;", 8804 },
    { @"&ge;", 8805 },
    { @"&sub;", 8834 },
    { @"&sup;", 8835 },
    { @"&nsub;", 8836 },
    { @"&sube;", 8838 },
    { @"&supe;", 8839 },
    { @"&oplus;", 8853 },
    { @"&otimes;", 8855 },
    { @"&perp;", 8869 },
    { @"&sdot;", 8901 },
    { @"&lceil;", 8968 },
    { @"&rceil;", 8969 },
    { @"&lfloor;", 8970 },
    { @"&rfloor;", 8971 },
    { @"&lang;", 9001 },
    { @"&rang;", 9002 },
    { @"&loz;", 9674 },
    { @"&spades;", 9824 },
    { @"&clubs;", 9827 },
    { @"&hearts;", 9829 },
    { @"&diams;", 9830 }
  };
  // Named character entities used when the output may keep non-ASCII text
  // as-is: only the "special" characters are replaced.
  // Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
  // This is table A.2.2 Special Characters
  //
  // Each escapeSequence is the full entity text ("&name;"). Rows MUST stay
  // ordered by uchar, lowest to highest, because the table is searched with
  // bsearch().
  static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
    // C0 Controls and Basic Latin
    { @"&quot;", 34 },
    { @"&amp;", 38 },
    { @"&apos;", 39 },
    { @"&lt;", 60 },
    { @"&gt;", 62 },

    // Latin Extended-A
    { @"&OElig;", 338 },
    { @"&oelig;", 339 },
    { @"&Scaron;", 352 },
    { @"&scaron;", 353 },
    { @"&Yuml;", 376 },

    // Spacing Modifier Letters
    { @"&circ;", 710 },
    { @"&tilde;", 732 },

    // General Punctuation
    { @"&ensp;", 8194 },
    { @"&emsp;", 8195 },
    { @"&thinsp;", 8201 },
    { @"&zwnj;", 8204 },
    { @"&zwj;", 8205 },
    { @"&lrm;", 8206 },
    { @"&rlm;", 8207 },
    { @"&ndash;", 8211 },
    { @"&mdash;", 8212 },
    { @"&lsquo;", 8216 },
    { @"&rsquo;", 8217 },
    { @"&sbquo;", 8218 },
    { @"&ldquo;", 8220 },
    { @"&rdquo;", 8221 },
    { @"&bdquo;", 8222 },
    { @"&dagger;", 8224 },
    { @"&Dagger;", 8225 },
    { @"&permil;", 8240 },
    { @"&lsaquo;", 8249 },
    { @"&rsaquo;", 8250 },
    { @"&euro;", 8364 },
  };
  // bsearch() comparator for the escape tables.
  //   ucharVoid - points at the unichar being looked up (the search key).
  //   mapVoid   - points at the HTMLEscapeMap row currently being probed.
  // Returns 1 when the key sorts after the row's uchar, -1 when it sorts
  // before, and 0 when they are equal.
  static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
    const unichar key = *(const unichar *)ucharVoid;
    const unichar probe = ((const HTMLEscapeMap *)mapVoid)->uchar;
    // Each comparison yields 0 or 1, so the difference is exactly -1, 0 or 1.
    return (key > probe) - (key < probe);
  }
  @implementation NSString (GTMNSStringHTMLAdditions)

  // Builds the string for a decoded numeric character reference.
  // Returns nil when the value must be left untouched: 0, U+FFFF (kept
  // unconverted to match the long-standing `value < USHRT_MAX` rule), or
  // anything above U+10FFFF. Values above the BMP become a surrogate pair.
  static NSString *StringForCodePoint(UTF32Char codePoint) {
    if (codePoint == 0 || codePoint == USHRT_MAX || codePoint > 0x10FFFF) {
      return nil;
    }
    if (codePoint < USHRT_MAX) {
      unichar single = (unichar)codePoint;
      return [NSString stringWithCharacters:&single length:1];
    }
    UniChar pair[2];
    if (!CFStringGetSurrogatePairForLongCharacter(codePoint, pair)) {
      return nil;
    }
    return [NSString stringWithCharacters:pair length:2];
  }

  // Core escaping routine shared by the two public escape methods.
  //   table         - escape table sorted ascending by uchar (searched with
  //                   bsearch).
  //   size          - size of |table| in BYTES (callers pass sizeof(table)).
  //   escapeUnicode - when YES, any character above 127 that has no table
  //                   entry is written as a decimal reference (&#NNN;).
  // Returns the receiver itself when it is empty, a new escaped string
  // otherwise, or nil if a working buffer could not be obtained.
  - (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                            ofSize:(NSUInteger)size
                                   escapingUnicode:(BOOL)escapeUnicode {
    NSUInteger length = [self length];
    if (!length) {
      return self;
    }

    NSMutableString *finalString = [NSMutableString string];

    // Scratch space for runs of characters that need no escaping. It is
    // created with its full LENGTH (not just a capacity hint) because it is
    // written through -mutableBytes below; a zero-length data object gives no
    // guarantee that the pointer is valid or that the storage exists.
    NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

    // this block is common between GTMNSString+HTML and GTMNSString+XML but
    // it's so short that it isn't really worth trying to share.
    const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
    if (!buffer) {
      // No direct access to the characters; copy them into an autoreleased
      // buffer instead.
      NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
      if (!data) {
        // COV_NF_START  - Memory fail case
        _GTMDevLog(@"couldn't alloc buffer");
        return nil;
        // COV_NF_END
      }
      [self getCharacters:[data mutableBytes]];
      buffer = [data bytes];
    }

    if (!buffer || !data2) {
      // COV_NF_START
      _GTMDevLog(@"Unable to allocate buffer or data2");
      return nil;
      // COV_NF_END
    }

    unichar *buffer2 = (unichar *)[data2 mutableBytes];
    NSUInteger buffer2Length = 0;

    for (NSUInteger i = 0; i < length; ++i) {
      HTMLEscapeMap *val = bsearch(&buffer[i], table,
                                   size / sizeof(HTMLEscapeMap),
                                   sizeof(HTMLEscapeMap), EscapeMapCompare);
      if (val || (escapeUnicode && buffer[i] > 127)) {
        // Flush the pending run of plain characters before the escape.
        if (buffer2Length) {
          CFStringAppendCharacters((CFMutableStringRef)finalString,
                                   buffer2,
                                   buffer2Length);
          buffer2Length = 0;
        }
        if (val) {
          [finalString appendString:val->escapeSequence];
        } else {
          _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
          UTF32Char codePoint = buffer[i];
          // A surrogate pair encodes ONE character; emit a single reference
          // for the combined code point rather than one per surrogate half,
          // which would not be a valid character reference.
          if (CFStringIsSurrogateHighCharacter(buffer[i]) &&
              i + 1 < length &&
              CFStringIsSurrogateLowCharacter(buffer[i + 1])) {
            codePoint = CFStringGetLongCharacterForSurrogatePair(buffer[i],
                                                                 buffer[i + 1]);
            ++i;  // The low surrogate has been consumed as well.
          }
          [finalString appendFormat:@"&#%d;", (int)codePoint];
        }
      } else {
        buffer2[buffer2Length] = buffer[i];
        buffer2Length += 1;
      }
    }
    if (buffer2Length) {
      CFStringAppendCharacters((CFMutableStringRef)finalString,
                               buffer2,
                               buffer2Length);
    }
    return finalString;
  }

  // Escapes only the characters listed in gUnicodeHTMLEscapeMap; all other
  // characters, including non-ASCII ones, are copied through unchanged.
  - (NSString *)gtm_stringByEscapingForHTML {
    return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                             ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                    escapingUnicode:NO];
  }

  // Escapes using gAsciiHTMLEscapeMap and additionally turns every remaining
  // character above 127 into a decimal character reference.
  - (NSString *)gtm_stringByEscapingForAsciiHTML {
    return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                             ofSize:sizeof(gAsciiHTMLEscapeMap)
                                    escapingUnicode:YES];
  }

  // Replaces named entities (from gAsciiHTMLEscapeMap), decimal references
  // (&#123;) and hex references (&#xa3;) with the characters they denote.
  // Anything that does not parse as one of those is left exactly as written.
  // Returns the receiver itself when it contains no '&'.
  - (NSString *)gtm_stringByUnescapingFromHTML {
    NSRange range = NSMakeRange(0, [self length]);
    NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

    // if no ampersands, we've got a quick way out
    if (subrange.length == 0) return self;

    // The scan runs from the END of the string towards the start. All ranges
    // are computed against the receiver but applied to finalString; that is
    // safe because a replacement only shifts text located after it, which has
    // already been processed.
    NSMutableString *finalString = [NSMutableString stringWithString:self];
    do {
      // Look for the terminating ';' between this '&' and the previous one
      // handled (or the end of the string on the first pass).
      NSRange semiColonRange = NSMakeRange(subrange.location,
                                           NSMaxRange(range) - subrange.location);
      semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
      // Narrow the window now so that `continue` resumes before this '&'.
      range = NSMakeRange(0, subrange.location);
      // if we don't find a semicolon in the range, we don't have a sequence
      if (semiColonRange.location == NSNotFound) {
        continue;
      }
      NSRange escapeRange = NSMakeRange(subrange.location,
                                        semiColonRange.location - subrange.location + 1);
      NSString *escapeString = [self substringWithRange:escapeRange];
      NSUInteger length = [escapeString length];
      // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
      if (length <= 3 || length >= 11) {
        continue;
      }

      NSString *replacement = nil;
      if ([escapeString characterAtIndex:1] == '#') {
        unichar char2 = [escapeString characterAtIndex:2];
        if (char2 == 'x' || char2 == 'X') {
          // Hex escape sequences &#xa3; - digits sit between "&#x" and ";".
          NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)];
          NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
          unsigned value;
          // The scanner must consume every digit, otherwise the sequence has
          // trailing junk and is not a reference.
          if ([scanner scanHexInt:&value] &&
              value > 0 &&
              [scanner scanLocation] == length - 4) {
            replacement = StringForCodePoint((UTF32Char)value);
          }
        } else {
          // Decimal sequences &#123; - digits sit between "&#" and ";".
          NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)];
          NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
          int value;
          if ([scanner scanInt:&value] &&
              value > 0 &&
              [scanner scanLocation] == length - 3) {
            replacement = StringForCodePoint((UTF32Char)value);
          }
        }
      } else {
        // "standard" sequences: named entities, matched case-sensitively.
        const NSUInteger count = sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
        for (NSUInteger i = 0; i < count; ++i) {
          if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
            replacement = [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar
                                                  length:1];
            break;
          }
        }
      }

      if (replacement) {
        [finalString replaceCharactersInRange:escapeRange withString:replacement];
      }
    } while ((subrange = [self rangeOfString:@"&"
                                     options:NSBackwardsSearch
                                       range:range]).length != 0);
    return finalString;
  }

  @end