//
// NSString+HTML.m
// MWFeedParser
//
// Copyright (c) 2010 Michael Waterfall
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// 1. The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// 2. This Software cannot be used to archive or collect data such as (but not
// limited to) that of events, news, experiences and activities, for the
// purpose of any concept relating to diary/journal keeping.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
#import "NSString+HTML.h"
#import "GTMNSString+HTML.h"
@implementation NSString (HTML)
#pragma mark -
#pragma mark Class Methods
#pragma mark -
#pragma mark Instance Methods
// Convert HTML to plain text.
// Strips tags and comments, collapses runs of whitespace into single spaces,
// emits a line break after closing paragraph/heading tags, puts the src of
// each <img> on its own line, drops the contents of <title> elements and
// finally decodes HTML entities.
// Returns an autoreleased string.
- (NSString *)stringByConvertingHTMLToPlainText {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Character sets (%C takes a unichar, so the code points are cast explicitly)
    NSString *whitespaceString = [NSString stringWithFormat:@" \t\n\r%C%C%C%C",
                                  (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029];
    NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:whitespaceString];
    NSCharacterSet *stopCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                      [@"<" stringByAppendingString:whitespaceString]];
    NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         @"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"];
    NSCharacterSet *quoteCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"\'"];

    // Closing tags of these (inline) elements are not replaced with a space
    NSSet *inlineTagNames = [NSSet setWithObjects:@"a", @"b", @"i", @"q", @"span", @"em", @"strong",
                             @"cite", @"abbr", @"acronym", @"label", nil];

    // Scan and find all tags
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    [scanner setCaseSensitive:YES];
    NSString *str = nil, *tagName = nil;
    do {

        // Scan up to the start of a tag or whitespace
        if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) {
            [result appendString:str];
            str = nil; // reset
        }

        // Check if we've stopped at a tag/comment or whitespace
        if ([scanner scanString:@"<" intoString:NULL]) {

            // Stopped at a comment or tag
            if ([scanner scanString:@"!--" intoString:NULL]) {

                // Comment
                [scanner scanUpToString:@"-->" intoString:NULL];
                [scanner scanString:@"-->" intoString:NULL];

            } else {

                // Set when a branch has already consumed the tag's closing '>'.
                // Scanning for another '>' afterwards would swallow the text
                // that follows the tag, up to and including the next tag.
                BOOL tagFullyConsumed = NO;

                if ([scanner scanString:@"/p>" intoString:NULL]) {

                    // End of paragraph - line break followed by a space
                    [result appendString:@"\n"];
                    [result appendString:@" "];
                    tagFullyConsumed = YES;

                } else if ([scanner scanString:@"/h" intoString:NULL]) {

                    // Closing tag starting with "h" (headings) - line break
                    [result appendString:@"\n"];

                } else if ([scanner scanString:@"img" intoString:NULL]) {

                    // Image - output its source on a line of its own. The search
                    // for "src" is confined to this tag so an image without a
                    // src attribute cannot consume the rest of the document.
                    NSString *attributes = nil;
                    if ([scanner scanUpToString:@">" intoString:&attributes]) {
                        NSScanner *attributeScanner = [NSScanner scannerWithString:attributes];
                        [attributeScanner setCharactersToBeSkipped:nil];
                        [attributeScanner setCaseSensitive:YES];
                        [attributeScanner scanUpToString:@"src" intoString:NULL];
                        if ([attributeScanner scanString:@"src" intoString:NULL]) {
                            [attributeScanner scanString:@"=" intoString:NULL];
                            [attributeScanner scanString:@"\'" intoString:NULL];
                            [attributeScanner scanString:@"\"" intoString:NULL];
                            NSString *imgString = nil;
                            if ([attributeScanner scanUpToCharactersFromSet:quoteCharacters intoString:&imgString]) {
                                [result appendString:[NSString stringWithFormat:@"\n%@\n", imgString]];
                            }
                        }
                    }

                } else if ([scanner scanString:@"title" intoString:NULL]) {

                    // Title - skip the element's contents along with its closing tag
                    [scanner scanUpToString:@"</title>" intoString:NULL];
                    [scanner scanString:@"</title>" intoString:NULL];
                    tagFullyConsumed = YES;

                } else if ([scanner scanString:@"/" intoString:NULL]) {

                    // Closing tag - replace with space unless it's inline
                    BOOL dontReplaceTagWithSpace = NO;
                    tagName = nil;
                    if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
                        dontReplaceTagWithSpace = [inlineTagNames containsObject:[tagName lowercaseString]];
                    }

                    // Dont append space to beginning or end of result
                    if (!dontReplaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) {
                        [result appendString:@" "];
                    }

                }

                // Scan past tag
                if (!tagFullyConsumed) {
                    [scanner scanUpToString:@">" intoString:NULL];
                    [scanner scanString:@">" intoString:NULL];
                }

            }

        } else {

            // Stopped at whitespace - replace all whitespace and newlines with a space
            if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
                if (result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "]; // Dont append space to beginning or end of result
            }

        }

    } while (![scanner isAtEnd]);

    // Cleanup
    [scanner release];

    // Decode HTML entities; retained so the string survives the pool drain
    NSString *retString = [[result stringByDecodingHTMLEntities] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}
// Decode all HTML entities using GTM
- (NSString *)stringByDecodingHTMLEntities {

    // The GTM helper can hand back the receiver itself, so the result is
    // always wrapped in a newly created string
    NSString *unescaped = [self gtm_stringByUnescapingFromHTML];
    return [NSString stringWithString:unescaped];

}
// Encode all HTML entities using GTM
- (NSString *)stringByEncodingHTMLEntities {

    // Presumably gtm_stringByEscapingForAsciiHTML can hand back the receiver
    // itself (as its unescaping counterpart does), so the result is always
    // wrapped in a newly created string
    NSString *escaped = [self gtm_stringByEscapingForAsciiHTML];
    return [NSString stringWithString:escaped];

}
// Replace newlines with <br /> tags.
// Every line break in the receiver becomes one "<br />"; a "\r\n" pair counts
// as a single line break. All other characters are copied unchanged.
// Returns an autoreleased string.
- (NSString *)stringWithNewLinesAsBRs {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Strange New lines:
    //  Next Line, U+0085
    //  Form Feed, U+000C
    //  Line Separator, U+2028
    //  Paragraph Separator, U+2029
    // (%C takes a unichar, so the code points are cast explicitly)
    NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         [NSString stringWithFormat:@"\n\r%C%C%C%C",
                                          (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]];

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSString *temp;

    // Scan
    do {

        // Get non new line characters
        temp = nil;
        if ([scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp]) {
            [result appendString:temp];
        }

        // Add one <br /> per line break in the run of new line characters
        temp = nil;
        if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) {
            NSUInteger length = temp.length;
            for (NSUInteger i = 0; i < length; i++) {
                // Combine \r\n into just 1, wherever it appears within the run
                if ([temp characterAtIndex:i] == '\r' &&
                    i + 1 < length &&
                    [temp characterAtIndex:i + 1] == '\n') {
                    i++;
                }
                [result appendString:@"<br />"];
            }
        }

    } while (![scanner isAtEnd]);

    // Cleanup; the copy is retained so it survives the pool drain
    [scanner release];
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];

}
// Remove newlines and white space from string.
// Each run of whitespace / new line characters between two words collapses
// to a single space; leading and trailing runs are removed entirely.
- (NSString *)stringByRemovingNewLinesAndWhitespace {

    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Space, tab, LF, CR plus the unusual line breaks:
    // Next Line (U+0085), Form Feed (U+000C),
    // Line Separator (U+2028), Paragraph Separator (U+2029)
    NSString *separatorString = [NSString stringWithFormat:@" \t\n\r%C%C%C%C",
                                 (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029];
    NSCharacterSet *separators = [NSCharacterSet characterSetWithCharactersInString:separatorString];

    NSScanner *wordScanner = [[NSScanner alloc] initWithString:self];
    [wordScanner setCharactersToBeSkipped:nil];
    NSMutableString *collapsed = [[NSMutableString alloc] init];

    for (;;) {
        if ([wordScanner isAtEnd]) break;

        // Copy the next word verbatim
        NSString *word = nil;
        if ([wordScanner scanUpToCharactersFromSet:separators intoString:&word] && word) {
            [collapsed appendString:word];
        }

        // Swallow the separators that follow it
        if (![wordScanner scanCharactersFromSet:separators intoString:NULL]) continue;

        // A single space stands in for the run, but never at the very start
        // or the very end of the output
        if (collapsed.length == 0 || [wordScanner isAtEnd]) continue;
        [collapsed appendString:@" "];
    }

    [wordScanner release];

    // Immutable copy, owned by us so that it outlives the pool drain
    NSString *retString = [[NSString alloc] initWithString:collapsed];
    [collapsed release];

    [pool drain];

    return [retString autorelease];

}
// Strip HTML tags
// DEPRECATED - Please use NSString stringByConvertingHTMLToPlainText
// Every distinct "<...>" sequence found in the receiver is replaced with a
// space (or with nothing for a few inline elements), then whitespace is
// collapsed. Returns an autoreleased string.
- (NSString *)stringByStrippingTags {

    // Find first < and short-cut if we can. This is done before the pool is
    // created so the early return cannot leave an undrained pool behind.
    NSUInteger tagIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location;
    if (tagIndex == NSNotFound) {
        return [NSString stringWithString:self]; // return copy of string as no tags found
    }

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scan and find all tags
    NSScanner *scanner = [NSScanner scannerWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableSet *tags = [[NSMutableSet alloc] init];
    NSString *tag;
    do {

        // Scan up to <, then collect everything up to (not including) >
        tag = nil;
        [scanner scanUpToString:@"<" intoString:NULL];
        [scanner scanUpToString:@">" intoString:&tag];

        // Add to set, with the closing > put back
        if (tag) {
            NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag];
            [tags addObject:t];
            [t release];
        }

    } while (![scanner isAtEnd]);

    // Inline elements whose tags are removed without leaving a space.
    // NOTE(review): the comparison strings in this file were all empty and
    // could never match a tag; this list restores the inline-element intent -
    // confirm it against the upstream MWFeedParser source.
    NSSet *inlineTags = [NSSet setWithObjects:@"<a>", @"</a>", @"<span>", @"</span>",
                         @"<strong>", @"</strong>", @"<em>", @"</em>", nil];

    // Replace tags
    NSMutableString *result = [[NSMutableString alloc] initWithString:self];
    for (NSString *t in tags) {

        // Replace tag with space unless it's an inline element
        NSString *replacement = [inlineTags containsObject:t] ? @"" : @" ";

        // Replace
        [result replaceOccurrencesOfString:t
                                withString:replacement
                                   options:NSLiteralSearch
                                     range:NSMakeRange(0, result.length)];
    }

    // Remove multi-spaces and line breaks; retained to survive the pool drain
    NSString *finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];

    // Cleanup
    [result release];
    [tags release];

    // Drain
    [pool drain];

    // Return
    return [finalString autorelease];

}
/**
 Rewrites the <img> tags of an HTML string so that each image is given the
 specified width.

 A few escaped characters are unescaped first. Every <img> tag containing an
 "http" URL terminated by a quote is then replaced with a freshly built tag;
 tags without such a URL are left untouched.

 @param width Width to assign to each rewritten image
 @return The rewritten HTML, or an empty string when the receiver is empty
 */
- (NSString *)htmlWebAutoImageSizeWidth:(CGFloat)width {
    if (self.length == 0) {
        return @"";
    }

    // NOTE(review): the entity literals in this file were mangled (decoded into
    // the bare characters, which does not compile). They are reconstructed here
    // from their replacement values; the first one in particular is a best
    // guess (double-escaped quote) - confirm against version control.
    NSString *content = [self stringByReplacingOccurrencesOfString:@"&amp;quot;" withString:@"'"];
    content = [content stringByReplacingOccurrencesOfString:@"&lt;" withString:@"<"];
    content = [content stringByReplacingOccurrencesOfString:@"&gt;" withString:@">"];
    content = [content stringByReplacingOccurrencesOfString:@"&quot;" withString:@"\""];

    NSString *regExpStr = @"<(img|IMG)[^\\<\\>]*>";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:regExpStr
                                                                           options:NSRegularExpressionCaseInsensitive
                                                                             error:nil];
    NSArray *matches = [regex matchesInString:content
                                      options:0
                                        range:NSMakeRange(0, content.length)];

    // Replace from the last match to the first so that the ranges of the
    // matches still to be processed stay valid while the string is edited
    NSMutableString *html = [NSMutableString stringWithString:content];
    for (NSUInteger remaining = matches.count; remaining > 0; remaining--) {
        NSUInteger index = remaining - 1;
        NSTextCheckingResult *match = matches[index];
        NSString *tag = [content substringWithRange:match.range];

        // Locate the URL; a tag without one (relative path, data URI, ...) is
        // left as it is instead of raising a range exception
        NSRange urlStart = [tag rangeOfString:@"http"];
        if (urlStart.location == NSNotFound) {
            continue;
        }
        NSString *tail = [tag substringFromIndex:urlStart.location];

        // The URL runs up to the closing quote: a single quote when it was
        // opened with one, a double quote otherwise
        BOOL singleQuoted = (urlStart.location > 0 &&
                             [tag characterAtIndex:urlStart.location - 1] == '\'');
        NSRange urlEnd = [tail rangeOfString:(singleQuoted ? @"'" : @"\"")];
        if (urlEnd.location == NSNotFound) {
            continue;
        }
        NSString *url = [tail substringToIndex:urlEnd.location];

        // NOTE(review): the format string in this file was empty, which removed
        // every image and ignored all three arguments. This tag is a
        // reconstruction from those arguments (URL, a per-image unique number,
        // width); the attribute carrying the unique number is a guess - confirm
        // against version control.
        NSString *replacement = [NSString stringWithFormat:@"<img src=\"%@\" id=\"%f\" width=\"%.0f\">",
                                 url, [NSDate timeIntervalSinceReferenceDate] + index, width];
        [html replaceCharactersInRange:match.range withString:replacement];
    }

    return [NSString stringWithString:html];
}
@end