123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522 |
- //
- // GTMNSString+HTML.m
- // Dealing with NSStrings that contain HTML
- //
- // Copyright 2006-2008 Google Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License"); you may not
- // use this file except in compliance with the License. You may obtain a copy
- // of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- // License for the specific language governing permissions and limitations under
- // the License.
- //
- #import "GTMDefines.h"
- #import "GTMNSString+HTML.h"
// One row of an escape table: an escape string paired with the UTF-16 code
// unit it stands for. Field order matters because the tables in this file are
// built with positional initializers.
typedef struct HTMLEscapeMap {
  NSString *escapeSequence;  // Text appended to the output in place of uchar.
  unichar uchar;             // Code unit used as the lookup key.
} HTMLEscapeMap;
- // Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
- // Ordered by uchar lowest to highest for bsearching
// Full named-entity table (special characters, Latin-1 and symbols) used when
// the output must be pure ASCII, and as the lookup table when unescaping.
// Entries MUST stay sorted by uchar, lowest to highest: the escaping code
// looks characters up with bsearch().
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
- // Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
- // This is table A.2.2 Special Characters
// Reduced table (the "special characters" set only) used when the output may
// contain non-ASCII characters verbatim. Entries MUST stay sorted by uchar,
// lowest to highest: the escaping code looks characters up with bsearch().
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
- // Utility function for Bsearching table above
// bsearch() comparator. |ucharVoid| points at the unichar being looked up and
// |mapVoid| at the HTMLEscapeMap entry under consideration. Returns 0 when the
// entry's uchar equals the key, 1 when the key is greater, -1 when it is less.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar entryChar = ((const HTMLEscapeMap *)mapVoid)->uchar;
  if (key == entryChar) {
    return 0;
  }
  return (key > entryChar) ? 1 : -1;
}
@implementation NSString (GTMNSStringHTMLAdditions)

// Shared worker for the two public escaping methods.
//
// Args:
//   table: escape table sorted by uchar (it is searched with bsearch()).
//   size: size of |table| in BYTES (callers pass sizeof(table)), not the
//         number of entries.
//   escapeUnicode: when YES, any code unit above 127 that has no entry in
//                  |table| is written as a decimal numeric reference (&#N;).
//
// Returns:
//   The receiver itself when it is empty, nil if a working buffer could not be
//   obtained, otherwise a new string with the replacements applied.
//
// NOTE: the string is processed one UTF-16 code unit at a time, so a character
// outside the BMP is emitted as two numeric references (one per surrogate)
// when |escapeUnicode| is YES.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];

  // Scratch space that collects runs of characters needing no escaping so they
  // can be appended in one call. It must be created with dataWithLength:, not
  // dataWithCapacity: -- a capacity is only a hint and the resulting object has
  // length 0, so writing through -mutableBytes would not be backed by storage
  // the data object guarantees.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // No direct pointer to the characters is available, so copy them out.
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes]];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }

  unichar *buffer2 = (unichar *)[data2 mutableBytes];

  NSUInteger buffer2Length = 0;

  for (NSUInteger i = 0; i < length; ++i) {
    HTMLEscapeMap *val = bsearch(&buffer[i], table,
                                 size / sizeof(HTMLEscapeMap),
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && buffer[i] > 127)) {
      // Flush the pending run of unescaped characters before the escape.
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString,
                                 buffer2,
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      }
      else {
        _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
        [finalString appendFormat:@"&#%d;", buffer[i]];
      }
    } else {
      buffer2[buffer2Length] = buffer[i];
      buffer2Length += 1;
    }
  }
  // Flush whatever unescaped run is left at the end of the string.
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}

// Escapes only the characters listed in gUnicodeHTMLEscapeMap; every other
// character (including non-ASCII ones) is passed through unchanged.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
} // gtm_stringByEscapingHTML

// Escapes using gAsciiHTMLEscapeMap and additionally turns any remaining code
// unit above 127 into a decimal numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
} // gtm_stringByEscapingAsciiHTML

// Replaces character references in the receiver with the characters they
// denote. Handles hexadecimal (&#xH;) and decimal (&#D;) numeric references for
// code points 1..0x10FFFF, plus the named sequences in gAsciiHTMLEscapeMap.
// Anything that does not parse as one of those is left untouched. Returns the
// receiver itself when it contains no ampersand.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  // Ampersands are visited from the end of the string towards the start, so a
  // replacement in finalString never shifts the ranges (computed against self)
  // of the sequences that are still to be processed.
  do {
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];
    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length > 3 && length < 11) {
      if ([escapeString characterAtIndex:1] == '#') {
        unichar char2 = [escapeString characterAtIndex:2];
        // Hex escape sequences look like &#xa3; and decimal ones like &#123;
        BOOL isHex = (char2 == 'x' || char2 == 'X');
        NSUInteger digitsStart = isHex ? 3 : 2;
        NSString *digits =
            [escapeString substringWithRange:NSMakeRange(digitsStart, length - digitsStart - 1)];
        NSScanner *scanner = [NSScanner scannerWithString:digits];
        unsigned value = 0;
        BOOL scanned = NO;
        if (isHex) {
          scanned = [scanner scanHexInt:&value];
        } else {
          int signedValue = 0;
          if ([scanner scanInt:&signedValue] && signedValue > 0) {
            value = (unsigned)signedValue;
            scanned = YES;
          }
        }
        // The scanner must have consumed every digit, and the value must be a
        // non-zero Unicode code point.
        if (scanned &&
            value > 0 &&
            value <= 0x10FFFF &&
            [scanner scanLocation] == [digits length]) {
          unichar units[2];
          NSUInteger unitCount = 1;
          if (value > 0xFFFF) {
            // Code points beyond the BMP are stored as a UTF-16 surrogate pair.
            unsigned offset = value - 0x10000;
            units[0] = (unichar)(0xD800 + (offset >> 10));
            units[1] = (unichar)(0xDC00 + (offset & 0x3FF));
            unitCount = 2;
          } else {
            units[0] = (unichar)value;
          }
          NSString *charString = [NSString stringWithCharacters:units length:unitCount];
          [finalString replaceCharactersInRange:escapeRange withString:charString];
        }
      } else {
        // "standard" sequences
        for (unsigned i = 0; i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap); ++i) {
          if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
            [finalString replaceCharactersInRange:escapeRange withString:[NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1]];
            break;
          }
        }
      }
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingHTML

@end
|