//
//  NSString+HTML.m
//  MWFeedParser
//
//  Copyright (c) 2010 Michael Waterfall
//  
//  Permission is hereby granted, free of charge, to any person obtaining a copy
//  of this software and associated documentation files (the "Software"), to deal
//  in the Software without restriction, including without limitation the rights
//  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//  copies of the Software, and to permit persons to whom the Software is
//  furnished to do so, subject to the following conditions:
//  
//  1. The above copyright notice and this permission notice shall be included
//     in all copies or substantial portions of the Software.
//  
//  2. This Software cannot be used to archive or collect data such as (but not
//     limited to) that of events, news, experiences and activities, for the 
//     purpose of any concept relating to diary/journal keeping.
//  
//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
//  THE SOFTWARE.
//

#import "NSString+HTML.h"
#import "GTMNSString+HTML.h"

@implementation NSString (HTML)

#pragma mark -
#pragma mark Class Methods

#pragma mark -
#pragma mark Instance Methods

// Strip HTML tags
- (NSString *)stringByConvertingHTMLToPlainText {
	
	// Pool
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	
	// Character sets
	NSCharacterSet *stopCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@"< \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
	NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@" \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
	NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:@"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]; /**/
	
	// Scan and find all tags
	NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
	NSScanner *scanner = [[NSScanner alloc] initWithString:self];
	[scanner setCharactersToBeSkipped:nil];
	[scanner setCaseSensitive:YES];
	NSString *str = nil, *tagName = nil;
	BOOL dontReplaceTagWithSpace = NO;
	do {
		
		// Scan up to the start of a tag or whitespace
		if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) {
			[result appendString:str];
			str = nil; // reset
		}
		
		// Check if we've stopped at a tag/comment or whitespace
		if ([scanner scanString:@"<" intoString:NULL]) {
			
			// Stopped at a comment or tag
			if ([scanner scanString:@"!--" intoString:NULL]) {
				
				// Comment
				[scanner scanUpToString:@"-->" intoString:NULL];
                
				[scanner scanString:@"-->" intoString:NULL];
           
			} else {
				
				// Tag - remove and replace with space unless it's
                if ([scanner scanString:@"/p>" intoString:NULL]) {
                    [result appendString:@"\n"];
                    [result appendString:@"  "];
                    
                }
                if ([scanner scanString:@"/h" intoString:NULL]) {
                    [result appendString:@"\n"];
                }
                if ([scanner scanString:@"img" intoString:NULL]) {
                    [scanner scanUpToString:@"src" intoString:NULL];
                    [scanner scanString:@"src" intoString:NULL];
                    [scanner scanString:@"=" intoString:NULL];
                    [scanner scanString:@"\'" intoString:NULL];
                    [scanner scanString:@"\"" intoString:NULL];
                    NSString *imgString;
                    if ([scanner scanUpToCharactersFromSet:[NSCharacterSet characterSetWithCharactersInString:@"\"\'"] intoString:&imgString]) {
                        [result appendString:[NSString stringWithFormat:@"\n<img>%@</img>\n",imgString]];
                        imgString = nil; // reset
                    }
                    
                }
                if ([scanner scanString:@"title" intoString:NULL]) {
                    [scanner scanUpToString:@"</title>" intoString:NULL];
                    [scanner scanString:@"</title>" intoString:NULL];
                }
				// a closing inline tag then dont replace with a space
				if ([scanner scanString:@"/" intoString:NULL]) {
                    
					
					// Closing tag - replace with space unless it's inline
					tagName = nil; dontReplaceTagWithSpace = NO;
					if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
						tagName = [tagName lowercaseString];
						dontReplaceTagWithSpace = ([tagName isEqualToString:@"a"] ||
												   [tagName isEqualToString:@"b"] ||
												   [tagName isEqualToString:@"i"] ||
												   [tagName isEqualToString:@"q"] ||
												   [tagName isEqualToString:@"span"] ||
												   [tagName isEqualToString:@"em"] ||
												   [tagName isEqualToString:@"strong"] ||
												   [tagName isEqualToString:@"cite"] ||
												   [tagName isEqualToString:@"abbr"] ||
												   [tagName isEqualToString:@"acronym"] ||
												   [tagName isEqualToString:@"label"]);
					}
					
					// Replace tag with string unless it was an inline
					if (!dontReplaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "];
					
				}
				
				// Scan past tag
				[scanner scanUpToString:@">" intoString:NULL];
                
				[scanner scanString:@">" intoString:NULL];
				
			}
			
		} else {
			
			// Stopped at whitespace - replace all whitespace and newlines with a space
			if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
				if (result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "]; // Dont append space to beginning or end of result
			}
			
		}
		
	} while (![scanner isAtEnd]);
	
	// Cleanup
	[scanner release];
	
	// Decode HTML entities and return
	NSString *retString = [[result stringByDecodingHTMLEntities] retain];
	[result release];
	
	// Drain
	[pool drain];
	
	// Return
	return [retString autorelease];
	
}

// Decode all HTML entities using GTM
- (NSString *)stringByDecodingHTMLEntities {
	// gtm_stringByUnescapingFromHTML can return self so create new string ;)
	return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]]; 
}

// Encode all HTML entities using GTM
- (NSString *)stringByEncodingHTMLEntities {
	// gtm_stringByUnescapingFromHTML can return self so create new string ;)
	return [NSString stringWithString:[self gtm_stringByEscapingForAsciiHTML]];
}

// Replace newlines with <br /> tags
- (NSString *)stringWithNewLinesAsBRs {
	
	// Pool
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	
	// Strange New lines:
	//	Next Line, U+0085
	//	Form Feed, U+000C
	//	Line Separator, U+2028
	//	Paragraph Separator, U+2029
	
	// Scanner
	NSScanner *scanner = [[NSScanner alloc] initWithString:self];
	[scanner setCharactersToBeSkipped:nil];
	NSMutableString *result = [[NSMutableString alloc] init];
	NSString *temp;
	NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString:
										 [NSString stringWithFormat:@"\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
	// Scan
	do {
		
		// Get non new line characters
		temp = nil;
		[scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp];
		if (temp) [result appendString:temp];
		temp = nil;
		
		// Add <br /> s
		if ([scanner scanString:@"\r\n" intoString:nil]) {
			
			// Combine \r\n into just 1 <br />
			[result appendString:@"<br />"];
			
		} else if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) {
			
			// Scan other new line characters and add <br /> s
			if (temp) {
				for (int i = 0; i < temp.length; i++) {
					[result appendString:@"<br />"];
				}
			}
			
		}
		
	} while (![scanner isAtEnd]);
	
	// Cleanup & return
	[scanner release];
	NSString *retString = [[NSString stringWithString:result] retain];
	[result release];
	
	// Drain
	[pool drain];
	
	// Return
	return [retString autorelease];
	
}

// Remove newlines and white space from strong
- (NSString *)stringByRemovingNewLinesAndWhitespace {
	
	// Pool
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	
	// Strange New lines:
	//	Next Line, U+0085
	//	Form Feed, U+000C
	//	Line Separator, U+2028
	//	Paragraph Separator, U+2029
	
	// Scanner
	NSScanner *scanner = [[NSScanner alloc] initWithString:self];
	[scanner setCharactersToBeSkipped:nil];
	NSMutableString *result = [[NSMutableString alloc] init];
	NSString *temp;
	NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:
													  [NSString stringWithFormat:@" \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
	// Scan
	while (![scanner isAtEnd]) {
		
		// Get non new line or whitespace characters
		temp = nil;
		[scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&temp];
		if (temp) [result appendString:temp];
		
		// Replace with a space
		if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
			if (result.length > 0 && ![scanner isAtEnd]) // Dont append space to beginning or end of result
				[result appendString:@" "];
		}
		
	}
	
	// Cleanup
	[scanner release];
	
	// Return
	NSString *retString = [[NSString stringWithString:result] retain];
	[result release];
	
	// Drain
	[pool drain];
	
	// Return
	return [retString autorelease];
	
}

// Strip HTML tags
// DEPRECIATED - Please use NSString stringByConvertingHTMLToPlainText
- (NSString *)stringByStrippingTags {
	
	// Pool
	NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
	
	// Find first & and short-cut if we can
	NSUInteger ampIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location;
	if (ampIndex == NSNotFound) {
		return [NSString stringWithString:self]; // return copy of string as no tags found
	}
	
	// Scan and find all tags
	NSScanner *scanner = [NSScanner scannerWithString:self];
	[scanner setCharactersToBeSkipped:nil];
	NSMutableSet *tags = [[NSMutableSet alloc] init];
	NSString *tag;
	do {
		
		// Scan up to <
		tag = nil;
		[scanner scanUpToString:@"<" intoString:NULL];
		[scanner scanUpToString:@">" intoString:&tag];
		
		// Add to set
		if (tag) {
			NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag];
			[tags addObject:t];
			[t release];
		}
		
	} while (![scanner isAtEnd]);
	
	// Strings
	NSMutableString *result = [[NSMutableString alloc] initWithString:self];
	NSString *finalString;
	
	// Replace tags
	NSString *replacement;
	for (NSString *t in tags) {
		
		// Replace tag with space unless it's an inline element
		replacement = @" ";
		if ([t isEqualToString:@"<a>"] ||
			[t isEqualToString:@"</a>"] ||
			[t isEqualToString:@"<span>"] ||
			[t isEqualToString:@"</span>"] ||
			[t isEqualToString:@"<strong>"] ||
			[t isEqualToString:@"</strong>"] ||
			[t isEqualToString:@"<em>"] ||
			[t isEqualToString:@"</em>"]) {
			replacement = @"";
		}
		
		// Replace
		[result replaceOccurrencesOfString:t 
								withString:replacement
								   options:NSLiteralSearch 
									 range:NSMakeRange(0, result.length)];
	}
	
	// Remove multi-spaces and line breaks
	finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];
	
	// Cleanup
	[result release];
	[tags release];
	
	// Drain
	[pool drain];
	
	// Return
    return [finalString autorelease];
	
}

/**
 过滤HTML字符串中的图片指定宽度
 
 @param width 宽度
 @return result
 */
- (NSString *)htmlWebAutoImageSizeWidth:(CGFloat)width{
    if (self == nil || self.length == 0) {
        return @"";
    }
    NSString *content = [self stringByReplacingOccurrencesOfString:@"&amp;quot" withString:@"'"];
    content = [content stringByReplacingOccurrencesOfString:@"&lt;" withString:@"<"];
    content = [content stringByReplacingOccurrencesOfString:@"&gt;" withString:@">"];
    content = [content stringByReplacingOccurrencesOfString:@"&quot;" withString:@"\""];
    
    
    NSString *html = content;
    NSString * regExpStr = @"<(img|IMG)[^\\<\\>]*>";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:regExpStr options:NSRegularExpressionCaseInsensitive error:nil];
    
    NSArray *matches = [regex matchesInString:html
                                      options:0
                                        range:NSMakeRange(0, [html length])];
    
    
    //HTML中的<img ...... />数组
    NSMutableArray *imgArray = [NSMutableArray array];
    //<img src="URL"/>中的URL数组
    NSMutableArray *urlArray = [NSMutableArray array];
    
    for (NSTextCheckingResult *result in matches) {
        NSRange range   = result.range;
        NSString *group = [html substringWithRange:range];
        NSRange srange1 = [group rangeOfString:@"http"];
        NSString *tempString1 = [group substringWithRange:NSMakeRange(srange1.location, group.length - srange1.location)];
        NSRange srange2 = [tempString1 rangeOfString:@"\""];
        NSString *tempString2 = [tempString1 substringWithRange:NSMakeRange(0,srange2.location)];
        [urlArray addObject:tempString2];
        [imgArray addObject:group];
    }
    
    for (int i = 0; i < imgArray.count; i++) {
        NSString *string = imgArray[i];
        html = [html stringByReplacingOccurrencesOfString:string withString:[NSString stringWithFormat:@"<img src=\"%@\" title=\"\" alt=\"%lld\" width=\"%f\" height=\"auto\">",urlArray[i],[NSDate timeIntervalSinceReferenceDate]+i,width]];
    }
    
    
    return html;
}
@end