//
//  GTMNSString+HTML.m
//  Dealing with NSStrings that contain HTML
//
//  Copyright 2006-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
// 
//  http://www.apache.org/licenses/LICENSE-2.0
// 
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

#import "GTMDefines.h"
#import "GTMNSString+HTML.h"

// One row of an HTML escape table: a UTF-16 code unit paired with the escape
// sequence string that represents it.
typedef struct {
	NSString *escapeSequence;  // text written in place of uchar when escaping
	unichar uchar;             // character value; also the key EscapeMapCompare matches on
} HTMLEscapeMap;

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching.
//
// Every escapeSequence must be the full entity text ("&name;"), never the
// literal character it stands for: the escaping code writes the sequence to
// the output verbatim, and the unescaping code compares input text such as
// "&amp;" against these strings.
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
	// A.2.2. Special characters
	{ @"&quot;", 34 },
	{ @"&amp;", 38 },
	{ @"&apos;", 39 },
	{ @"&lt;", 60 },
	{ @"&gt;", 62 },
	
	// A.2.1. Latin-1 characters
	{ @"&nbsp;", 160 }, 
	{ @"&iexcl;", 161 }, 
	{ @"&cent;", 162 }, 
	{ @"&pound;", 163 }, 
	{ @"&curren;", 164 }, 
	{ @"&yen;", 165 }, 
	{ @"&brvbar;", 166 }, 
	{ @"&sect;", 167 }, 
	{ @"&uml;", 168 }, 
	{ @"&copy;", 169 }, 
	{ @"&ordf;", 170 }, 
	{ @"&laquo;", 171 }, 
	{ @"&not;", 172 }, 
	{ @"&shy;", 173 }, 
	{ @"&reg;", 174 }, 
	{ @"&macr;", 175 }, 
	{ @"&deg;", 176 }, 
	{ @"&plusmn;", 177 }, 
	{ @"&sup2;", 178 }, 
	{ @"&sup3;", 179 }, 
	{ @"&acute;", 180 }, 
	{ @"&micro;", 181 }, 
	{ @"&para;", 182 }, 
	{ @"&middot;", 183 }, 
	{ @"&cedil;", 184 }, 
	{ @"&sup1;", 185 }, 
	{ @"&ordm;", 186 }, 
	{ @"&raquo;", 187 }, 
	{ @"&frac14;", 188 }, 
	{ @"&frac12;", 189 }, 
	{ @"&frac34;", 190 }, 
	{ @"&iquest;", 191 }, 
	{ @"&Agrave;", 192 }, 
	{ @"&Aacute;", 193 }, 
	{ @"&Acirc;", 194 }, 
	{ @"&Atilde;", 195 }, 
	{ @"&Auml;", 196 }, 
	{ @"&Aring;", 197 }, 
	{ @"&AElig;", 198 }, 
	{ @"&Ccedil;", 199 }, 
	{ @"&Egrave;", 200 }, 
	{ @"&Eacute;", 201 }, 
	{ @"&Ecirc;", 202 }, 
	{ @"&Euml;", 203 }, 
	{ @"&Igrave;", 204 }, 
	{ @"&Iacute;", 205 }, 
	{ @"&Icirc;", 206 }, 
	{ @"&Iuml;", 207 }, 
	{ @"&ETH;", 208 }, 
	{ @"&Ntilde;", 209 }, 
	{ @"&Ograve;", 210 }, 
	{ @"&Oacute;", 211 }, 
	{ @"&Ocirc;", 212 }, 
	{ @"&Otilde;", 213 }, 
	{ @"&Ouml;", 214 }, 
	{ @"&times;", 215 }, 
	{ @"&Oslash;", 216 }, 
	{ @"&Ugrave;", 217 }, 
	{ @"&Uacute;", 218 }, 
	{ @"&Ucirc;", 219 }, 
	{ @"&Uuml;", 220 }, 
	{ @"&Yacute;", 221 }, 
	{ @"&THORN;", 222 }, 
	{ @"&szlig;", 223 }, 
	{ @"&agrave;", 224 }, 
	{ @"&aacute;", 225 }, 
	{ @"&acirc;", 226 }, 
	{ @"&atilde;", 227 }, 
	{ @"&auml;", 228 }, 
	{ @"&aring;", 229 }, 
	{ @"&aelig;", 230 }, 
	{ @"&ccedil;", 231 }, 
	{ @"&egrave;", 232 }, 
	{ @"&eacute;", 233 }, 
	{ @"&ecirc;", 234 }, 
	{ @"&euml;", 235 }, 
	{ @"&igrave;", 236 }, 
	{ @"&iacute;", 237 }, 
	{ @"&icirc;", 238 }, 
	{ @"&iuml;", 239 }, 
	{ @"&eth;", 240 }, 
	{ @"&ntilde;", 241 }, 
	{ @"&ograve;", 242 }, 
	{ @"&oacute;", 243 }, 
	{ @"&ocirc;", 244 }, 
	{ @"&otilde;", 245 }, 
	{ @"&ouml;", 246 }, 
	{ @"&divide;", 247 }, 
	{ @"&oslash;", 248 }, 
	{ @"&ugrave;", 249 }, 
	{ @"&uacute;", 250 }, 
	{ @"&ucirc;", 251 }, 
	{ @"&uuml;", 252 }, 
	{ @"&yacute;", 253 }, 
	{ @"&thorn;", 254 }, 
	{ @"&yuml;", 255 },
	
	// A.2.2. Special characters cont'd
	{ @"&OElig;", 338 },
	{ @"&oelig;", 339 },
	{ @"&Scaron;", 352 },
	{ @"&scaron;", 353 },
	{ @"&Yuml;", 376 },
	
	// A.2.3. Symbols
	{ @"&fnof;", 402 }, 
	
	// A.2.2. Special characters cont'd
	{ @"&circ;", 710 },
	{ @"&tilde;", 732 },
	
	// A.2.3. Symbols cont'd
	{ @"&Alpha;", 913 }, 
	{ @"&Beta;", 914 }, 
	{ @"&Gamma;", 915 }, 
	{ @"&Delta;", 916 }, 
	{ @"&Epsilon;", 917 }, 
	{ @"&Zeta;", 918 }, 
	{ @"&Eta;", 919 }, 
	{ @"&Theta;", 920 }, 
	{ @"&Iota;", 921 }, 
	{ @"&Kappa;", 922 }, 
	{ @"&Lambda;", 923 }, 
	{ @"&Mu;", 924 }, 
	{ @"&Nu;", 925 }, 
	{ @"&Xi;", 926 }, 
	{ @"&Omicron;", 927 }, 
	{ @"&Pi;", 928 }, 
	{ @"&Rho;", 929 }, 
	{ @"&Sigma;", 931 }, 
	{ @"&Tau;", 932 }, 
	{ @"&Upsilon;", 933 }, 
	{ @"&Phi;", 934 }, 
	{ @"&Chi;", 935 }, 
	{ @"&Psi;", 936 }, 
	{ @"&Omega;", 937 }, 
	{ @"&alpha;", 945 }, 
	{ @"&beta;", 946 }, 
	{ @"&gamma;", 947 }, 
	{ @"&delta;", 948 }, 
	{ @"&epsilon;", 949 }, 
	{ @"&zeta;", 950 }, 
	{ @"&eta;", 951 }, 
	{ @"&theta;", 952 }, 
	{ @"&iota;", 953 }, 
	{ @"&kappa;", 954 }, 
	{ @"&lambda;", 955 }, 
	{ @"&mu;", 956 }, 
	{ @"&nu;", 957 }, 
	{ @"&xi;", 958 }, 
	{ @"&omicron;", 959 }, 
	{ @"&pi;", 960 }, 
	{ @"&rho;", 961 }, 
	{ @"&sigmaf;", 962 }, 
	{ @"&sigma;", 963 }, 
	{ @"&tau;", 964 }, 
	{ @"&upsilon;", 965 }, 
	{ @"&phi;", 966 }, 
	{ @"&chi;", 967 }, 
	{ @"&psi;", 968 }, 
	{ @"&omega;", 969 }, 
	{ @"&thetasym;", 977 }, 
	{ @"&upsih;", 978 }, 
	{ @"&piv;", 982 }, 
	
	// A.2.2. Special characters cont'd
	{ @"&ensp;", 8194 },
	{ @"&emsp;", 8195 },
	{ @"&thinsp;", 8201 },
	{ @"&zwnj;", 8204 },
	{ @"&zwj;", 8205 },
	{ @"&lrm;", 8206 },
	{ @"&rlm;", 8207 },
	{ @"&ndash;", 8211 },
	{ @"&mdash;", 8212 },
	{ @"&lsquo;", 8216 },
	{ @"&rsquo;", 8217 },
	{ @"&sbquo;", 8218 },
	{ @"&ldquo;", 8220 },
	{ @"&rdquo;", 8221 },
	{ @"&bdquo;", 8222 },
	{ @"&dagger;", 8224 },
	{ @"&Dagger;", 8225 },
	
	// A.2.3. Symbols cont'd
	{ @"&bull;", 8226 }, 
	{ @"&hellip;", 8230 }, 
	
	// A.2.2. Special characters cont'd
	{ @"&permil;", 8240 },
	
	// A.2.3. Symbols cont'd
	{ @"&prime;", 8242 }, 
	{ @"&Prime;", 8243 }, 
	
	// A.2.2. Special characters cont'd
	{ @"&lsaquo;", 8249 },
	{ @"&rsaquo;", 8250 },
	
	// A.2.3. Symbols cont'd
	{ @"&oline;", 8254 }, 
	{ @"&frasl;", 8260 }, 
	
	// A.2.2. Special characters cont'd
	{ @"&euro;", 8364 },
	
	// A.2.3. Symbols cont'd
	{ @"&image;", 8465 },
	{ @"&weierp;", 8472 }, 
	{ @"&real;", 8476 }, 
	{ @"&trade;", 8482 }, 
	{ @"&alefsym;", 8501 }, 
	{ @"&larr;", 8592 }, 
	{ @"&uarr;", 8593 }, 
	{ @"&rarr;", 8594 }, 
	{ @"&darr;", 8595 }, 
	{ @"&harr;", 8596 }, 
	{ @"&crarr;", 8629 }, 
	{ @"&lArr;", 8656 }, 
	{ @"&uArr;", 8657 }, 
	{ @"&rArr;", 8658 }, 
	{ @"&dArr;", 8659 }, 
	{ @"&hArr;", 8660 }, 
	{ @"&forall;", 8704 }, 
	{ @"&part;", 8706 }, 
	{ @"&exist;", 8707 }, 
	{ @"&empty;", 8709 }, 
	{ @"&nabla;", 8711 }, 
	{ @"&isin;", 8712 }, 
	{ @"&notin;", 8713 }, 
	{ @"&ni;", 8715 }, 
	{ @"&prod;", 8719 }, 
	{ @"&sum;", 8721 }, 
	{ @"&minus;", 8722 }, 
	{ @"&lowast;", 8727 }, 
	{ @"&radic;", 8730 }, 
	{ @"&prop;", 8733 }, 
	{ @"&infin;", 8734 }, 
	{ @"&ang;", 8736 }, 
	{ @"&and;", 8743 }, 
	{ @"&or;", 8744 }, 
	{ @"&cap;", 8745 }, 
	{ @"&cup;", 8746 }, 
	{ @"&int;", 8747 }, 
	{ @"&there4;", 8756 }, 
	{ @"&sim;", 8764 }, 
	{ @"&cong;", 8773 }, 
	{ @"&asymp;", 8776 }, 
	{ @"&ne;", 8800 }, 
	{ @"&equiv;", 8801 }, 
	{ @"&le;", 8804 }, 
	{ @"&ge;", 8805 }, 
	{ @"&sub;", 8834 }, 
	{ @"&sup;", 8835 }, 
	{ @"&nsub;", 8836 }, 
	{ @"&sube;", 8838 }, 
	{ @"&supe;", 8839 }, 
	{ @"&oplus;", 8853 }, 
	{ @"&otimes;", 8855 }, 
	{ @"&perp;", 8869 }, 
	{ @"&sdot;", 8901 }, 
	{ @"&lceil;", 8968 }, 
	{ @"&rceil;", 8969 }, 
	{ @"&lfloor;", 8970 }, 
	{ @"&rfloor;", 8971 }, 
	{ @"&lang;", 9001 }, 
	{ @"&rang;", 9002 }, 
	{ @"&loz;", 9674 }, 
	{ @"&spades;", 9824 }, 
	{ @"&clubs;", 9827 }, 
	{ @"&hearts;", 9829 }, 
	{ @"&diams;", 9830 }
};

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters.
// Entries are in ascending uchar order because the table is bsearch'd.
//
// Every escapeSequence must be the full entity text ("&name;"), never the
// literal character it stands for: the escaping code writes the sequence to
// the output verbatim.
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
	// C0 Controls and Basic Latin
	{ @"&quot;", 34 },
	{ @"&amp;", 38 },
	{ @"&apos;", 39 },
	{ @"&lt;", 60 },
	{ @"&gt;", 62 },
	
	// Latin Extended-A
	{ @"&OElig;", 338 },
	{ @"&oelig;", 339 },
	{ @"&Scaron;", 352 },
	{ @"&scaron;", 353 },
	{ @"&Yuml;", 376 },
	
	// Spacing Modifier Letters
	{ @"&circ;", 710 },
	{ @"&tilde;", 732 },
	
	// General Punctuation
	{ @"&ensp;", 8194 },
	{ @"&emsp;", 8195 },
	{ @"&thinsp;", 8201 },
	{ @"&zwnj;", 8204 },
	{ @"&zwj;", 8205 },
	{ @"&lrm;", 8206 },
	{ @"&rlm;", 8207 },
	{ @"&ndash;", 8211 },
	{ @"&mdash;", 8212 },
	{ @"&lsquo;", 8216 },
	{ @"&rsquo;", 8217 },
	{ @"&sbquo;", 8218 },
	{ @"&ldquo;", 8220 },
	{ @"&rdquo;", 8221 },
	{ @"&bdquo;", 8222 },
	{ @"&dagger;", 8224 },
	{ @"&Dagger;", 8225 },
	{ @"&permil;", 8240 },
	{ @"&lsaquo;", 8249 },
	{ @"&rsaquo;", 8250 },
	{ @"&euro;", 8364 },
};


// bsearch() comparator used to look a character up in an HTMLEscapeMap table.
//
//  Args:
//    ucharVoid - the search key: points at the unichar being looked up
//    mapVoid   - points at the HTMLEscapeMap entry being tested
//
//  Returns:
//    1 if the key is greater than the entry's uchar, -1 if it is smaller,
//    0 if they are equal.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
	const unichar key = *(const unichar *)ucharVoid;
	const unichar candidate = ((const HTMLEscapeMap *)mapVoid)->uchar;
	if (key == candidate) {
		return 0;
	}
	return (key > candidate) ? 1 : -1;
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared worker behind the two public escaping methods.
//
// Walks the receiver one UTF-16 unit at a time. A unit found in |table| is
// replaced by that entry's escapeSequence. When |escapeUnicode| is YES, any
// other unit above 127 is written as a decimal numeric reference ("&#NNN;");
// a well-formed surrogate pair is combined into a single reference for its
// code point. Everything else is copied through unchanged.
//
//  Args:
//    table - escape map sorted ascending by uchar (it is bsearch'd)
//    size - size of |table| in BYTES (i.e. sizeof(table)), not an entry count
//    escapeUnicode - YES to numerically escape non-table characters > 127
//
//  Returns:
//    The receiver itself when it is empty, nil when a scratch buffer could
//    not be obtained, otherwise a newly built escaped string.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table 
                                          ofSize:(NSUInteger)size 
                                 escapingUnicode:(BOOL)escapeUnicode {  
	NSUInteger length = [self length];
	if (!length) {
		return self;
	}
	
	NSMutableString *finalString = [NSMutableString string];
	// Scratch space for the current run of characters that need no escaping.
	// This has to be dataWithLength:, not dataWithCapacity:. A capacity is only
	// a hint and leaves the data's length at zero, so writing |length| unichars
	// through -mutableBytes would be writing outside the object's contents.
	NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];
	
	// this block is common between GTMNSString+HTML and GTMNSString+XML but
	// it's so short that it isn't really worth trying to share.
	const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
	if (!buffer) {
		// No direct pointer to the characters is available, so copy them out.
		// We want this buffer to be autoreleased.
		NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
		if (!data) {
			// COV_NF_START  - Memory fail case
			_GTMDevLog(@"couldn't alloc buffer");
			return nil;
			// COV_NF_END
		}
		[self getCharacters:[data mutableBytes]];
		buffer = [data bytes];
	}
	
	if (!buffer || !data2) {
		// COV_NF_START
		_GTMDevLog(@"Unable to allocate buffer or data2");
		return nil;
		// COV_NF_END
	}
	
	unichar *buffer2 = (unichar *)[data2 mutableBytes];
	
	NSUInteger buffer2Length = 0;
	
	for (NSUInteger i = 0; i < length; ++i) {
		HTMLEscapeMap *val = bsearch(&buffer[i], table, 
									 size / sizeof(HTMLEscapeMap), 
									 sizeof(HTMLEscapeMap), EscapeMapCompare);
		if (val || (escapeUnicode && buffer[i] > 127)) {
			// Flush the pending run of plain characters before the escape.
			if (buffer2Length) {
				CFStringAppendCharacters((CFMutableStringRef)finalString, 
										 buffer2, 
										 buffer2Length);
				buffer2Length = 0;
			}
			if (val) {
				[finalString appendString:val->escapeSequence];
			}
			else {
				_GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
				// A numeric reference names a code point, not a UTF-16 unit, so a
				// surrogate pair must become ONE reference. An unpaired surrogate
				// is still written out by itself, as before.
				UTF32Char codePoint = buffer[i];
				if (CFStringIsSurrogateHighCharacter(buffer[i]) &&
					i + 1 < length &&
					CFStringIsSurrogateLowCharacter(buffer[i + 1])) {
					codePoint = CFStringGetLongCharacterForSurrogatePair(buffer[i],
																		 buffer[i + 1]);
					++i;  // the low surrogate has been consumed too
				}
				[finalString appendFormat:@"&#%u;", (unsigned int)codePoint];
			}
		} else {
			buffer2[buffer2Length] = buffer[i];
			buffer2Length += 1;
		}
	}
	// Flush whatever plain characters followed the last escape.
	if (buffer2Length) {
		CFStringAppendCharacters((CFMutableStringRef)finalString, 
								 buffer2, 
								 buffer2Length);
	}
	return finalString;
}

// Escapes only the characters listed in gUnicodeHTMLEscapeMap; all other
// characters, including those above 127, are passed through untouched.
- (NSString *)gtm_stringByEscapingForHTML {
	return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap 
											 ofSize:sizeof(gUnicodeHTMLEscapeMap) 
									escapingUnicode:NO];
} // gtm_stringByEscapingForHTML

// Escapes the characters listed in gAsciiHTMLEscapeMap by name and every
// other character above 127 as a decimal numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
	return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap 
											 ofSize:sizeof(gAsciiHTMLEscapeMap) 
									escapingUnicode:YES];
} // gtm_stringByEscapingForAsciiHTML

// Reverses HTML escaping. Replaces the named entities listed in
// gAsciiHTMLEscapeMap and numeric references (decimal "&#NNN;" and hex
// "&#xHH;", code points 1 through 0x10FFFF) with the characters they stand
// for. Anything that is not a recognised sequence is left exactly as it was.
//
//  Returns:
//    The receiver itself when it contains no '&', otherwise a new string.
- (NSString *)gtm_stringByUnescapingFromHTML {
	NSRange range = NSMakeRange(0, [self length]);
	NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];
	
	// if no ampersands, we've got a quick way out
	if (subrange.length == 0) return self;
	NSMutableString *finalString = [NSMutableString stringWithString:self];
	// Ampersands are visited from the END of the string towards the start.
	// Each replacement therefore only shifts text that has already been
	// processed, and ranges computed against |self| remain valid in
	// |finalString|.
	do {
		// Look for the terminating ';' between this '&' and the previously
		// visited '&' (or the end of the string on the first pass).
		NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
		semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
		// Narrow the search window now so that the 'continue' below still
		// moves on to the next '&'.
		range = NSMakeRange(0, subrange.location);
		// if we don't find a semicolon in the range, we don't have a sequence
		if (semiColonRange.location == NSNotFound) {
			continue;
		}
		NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
		NSString *escapeString = [self substringWithRange:escapeRange];
		NSUInteger length = [escapeString length];
		// a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
		if (length > 3 && length < 11) {
			if ([escapeString characterAtIndex:1] == '#') {
				// Numeric reference: hex (&#xa3;) or decimal (&#123;).
				unichar char2 = [escapeString characterAtIndex:2];
				BOOL isHex = (char2 == 'x' || char2 == 'X');
				// Digits sit between the "&#" / "&#x" prefix and the final ';'.
				NSUInteger digitsStart = isHex ? 3 : 2;
				NSUInteger digitsLength = length - digitsStart - 1;
				NSString *digits = [escapeString substringWithRange:NSMakeRange(digitsStart, digitsLength)];
				NSScanner *scanner = [NSScanner scannerWithString:digits];
				UTF32Char codePoint = 0;  // stays 0 (rejected below) if the scan fails
				if (isHex) {
					unsigned hexValue = 0;
					if ([scanner scanHexInt:&hexValue]) {
						codePoint = hexValue;
					}
				} else {
					int decimalValue = 0;
					if ([scanner scanInt:&decimalValue] && decimalValue > 0) {
						codePoint = (UTF32Char)decimalValue;
					}
				}
				// Accept every Unicode code point except 0, and require that the
				// scanner consumed all of the digits (no trailing junk).
				if (codePoint > 0 &&
					codePoint <= 0x10FFFF &&
					[scanner scanLocation] == digitsLength) {
					// Code points above 0xFFFF need two UTF-16 units; the CF helper
					// fills in one or two units and reports which case applied.
					unichar units[2];
					Boolean isPair = CFStringGetSurrogatePairForLongCharacter(codePoint, units);
					NSString *charString = [NSString stringWithCharacters:units length:(isPair ? 2 : 1)];
					[finalString replaceCharactersInRange:escapeRange withString:charString];
				}
			} else {
				// "standard" sequences
				for (unsigned i = 0; i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap); ++i) {
					if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
						[finalString replaceCharactersInRange:escapeRange withString:[NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1]];
						break;
					}
				}
			}
		}
	} while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
	return finalString;
} // gtm_stringByUnescapingFromHTML



@end