// // NSString+HTML.m // MWFeedParser // // Copyright (c) 2010 Michael Waterfall // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // 1. The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // 2. This Software cannot be used to archive or collect data such as (but not // limited to) that of events, news, experiences and activities, for the // purpose of any concept relating to diary/journal keeping. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
//

#import "NSString+HTML.h"
#import "GTMNSString+HTML.h"

@implementation NSString (HTML)

#pragma mark -
#pragma mark Class Methods

#pragma mark -
#pragma mark Instance Methods

/**
 Converts an HTML string to plain text.

 - Comments and tags are removed.
 - "</p>" becomes a new line followed by a space; closing tags starting with
   "</h" become a new line.
 - An <img> tag is replaced by the value of its src attribute on its own line.
 - A <title> element is dropped together with its content.
 - Any other closing tag is replaced by a single space unless it closes an
   inline element (a, b, i, q, span, em, strong, cite, abbr, acronym, label).
 - Runs of whitespace/new lines collapse to one space (none at the start or end).
 - HTML entities are decoded last.

 @return A new autoreleased plain-text string.
 */
- (NSString *)stringByConvertingHTMLToPlainText {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Character sets
    // Strange new lines: Next Line U+0085, Form Feed U+000C,
    // Line Separator U+2028, Paragraph Separator U+2029
    NSString *strangeNewLines = [NSString stringWithFormat:@"%C%C%C%C",
                                 (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029];
    NSCharacterSet *stopCharacters =
        [NSCharacterSet characterSetWithCharactersInString:[@"< \t\n\r" stringByAppendingString:strangeNewLines]];
    NSCharacterSet *newLineAndWhitespaceCharacters =
        [NSCharacterSet characterSetWithCharactersInString:[@" \t\n\r" stringByAppendingString:strangeNewLines]];
    NSCharacterSet *tagNameCharacters =
        [NSCharacterSet characterSetWithCharactersInString:@"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"];
    NSCharacterSet *quoteCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"\'"];

    // Closing tags of these inline elements are removed without leaving a space
    NSSet *inlineTagNames = [NSSet setWithObjects:@"a", @"b", @"i", @"q", @"span", @"em", @"strong",
                             @"cite", @"abbr", @"acronym", @"label", nil];

    // Scan and find all tags
    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    [scanner setCaseSensitive:YES];
    NSString *str = nil, *tagName = nil;
    BOOL dontReplaceTagWithSpace = NO;
    do {

        // Scan up to the start of a tag or whitespace
        if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) {
            [result appendString:str];
            str = nil; // reset
        }

        // Check if we've stopped at a tag/comment or whitespace
        if ([scanner scanString:@"<" intoString:NULL]) {

            // Stopped at a comment or tag
            if ([scanner scanString:@"!--" intoString:NULL]) {

                // Comment
                [scanner scanUpToString:@"-->" intoString:NULL];
                [scanner scanString:@"-->" intoString:NULL];

            } else {

                // YES once the scanner already sits after the tag/element, so the
                // generic "scan past tag" step must not run (it would otherwise
                // swallow the text that follows, up to the next '>').
                BOOL alreadyPastTag = NO;

                if ([scanner scanString:@"/p>" intoString:NULL]) {

                    // End of paragraph - new line followed by a space
                    [result appendString:@"\n "];
                    alreadyPastTag = YES;

                } else {

                    // Closing tag starting with "/h" (e.g. a heading) - new line
                    if ([scanner scanString:@"/h" intoString:NULL]) {
                        [result appendString:@"\n"];
                    }

                    // Image - output its src on its own line
                    if ([scanner scanString:@"img" intoString:NULL]) {

                        // Only look for src inside this tag; an <img> without a src
                        // must not make the scanner run on through the document.
                        NSString *attributes = nil;
                        if ([scanner scanUpToString:@">" intoString:&attributes]) {
                            NSScanner *attributeScanner = [[NSScanner alloc] initWithString:attributes];
                            [attributeScanner setCharactersToBeSkipped:nil];
                            [attributeScanner setCaseSensitive:YES];
                            [attributeScanner scanUpToString:@"src" intoString:NULL];
                            [attributeScanner scanString:@"src" intoString:NULL];
                            [attributeScanner scanString:@"=" intoString:NULL];
                            [attributeScanner scanString:@"\'" intoString:NULL];
                            [attributeScanner scanString:@"\"" intoString:NULL];
                            NSString *imageSource = nil;
                            if ([attributeScanner scanUpToCharactersFromSet:quoteCharacters intoString:&imageSource]) {
                                [result appendFormat:@"\n%@\n", imageSource];
                            }
                            [attributeScanner release];
                        }
                    }

                    // Title - skip the whole element including its content
                    // NOTE(review): the terminator literal was empty in the damaged source;
                    // "</title>" is a reconstruction - confirm against version control.
                    if ([scanner scanString:@"title" intoString:NULL]) {
                        [scanner scanUpToString:@"</title>" intoString:NULL];
                        [scanner scanString:@"</title>" intoString:NULL];
                        alreadyPastTag = YES;
                    }
                }

                if (!alreadyPastTag) {

                    // Tag - remove and replace with space unless it's
                    // a closing inline tag then dont replace with a space
                    if ([scanner scanString:@"/" intoString:NULL]) {

                        // Closing tag - replace with space unless it's inline
                        tagName = nil;
                        dontReplaceTagWithSpace = NO;
                        if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
                            tagName = [tagName lowercaseString];
                            dontReplaceTagWithSpace = [inlineTagNames containsObject:tagName];
                        }

                        // Replace tag with string unless it was an inline
                        if (!dontReplaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) {
                            [result appendString:@" "];
                        }
                    }

                    // Scan past tag
                    [scanner scanUpToString:@">" intoString:NULL];
                    [scanner scanString:@">" intoString:NULL];
                }
            }

        } else {

            // Stopped at whitespace - replace all whitespace and newlines with a space
            if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
                // Dont append space to beginning or end of result
                if (result.length > 0 && ![scanner isAtEnd]) {
                    [result appendString:@" "];
                }
            }
        }

    } while (![scanner isAtEnd]);

    // Cleanup
    [scanner release];

    // Decode HTML entities; retain so the string survives the pool drain
    NSString *retString = [[result stringByDecodingHTMLEntities] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];
}

/**
 Decodes all HTML entities using GTM.

 @return A new string with the entities unescaped.
 */
- (NSString *)stringByDecodingHTMLEntities {
    // gtm_stringByUnescapingFromHTML can return self so create new string ;)
    return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]];
}

/**
 Encodes all HTML entities using GTM.

 @return A new string escaped for ASCII HTML.
 */
- (NSString *)stringByEncodingHTMLEntities {
    // Wrap the GTM result in a new string, mirroring stringByDecodingHTMLEntities,
    // in case the receiver itself is returned (presumably possible - not verified here).
    return [NSString stringWithString:[self gtm_stringByEscapingForAsciiHTML]];
}

/**
 Replaces new lines with <br /> tags.

 Every new line character (\n, \r, U+0085, U+000C, U+2028, U+2029) becomes one
 "<br />"; a "\r\n" pair counts as a single new line wherever it appears.

 @return A new autoreleased string.
 */
- (NSString *)stringWithNewLinesAsBRs {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Strange New lines:
    //  Next Line, U+0085
    //  Form Feed, U+000C
    //  Line Separator, U+2028
    //  Paragraph Separator, U+2029

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] init];
    NSString *temp;
    NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                         [NSString stringWithFormat:@"\n\r%C%C%C%C",
                                          (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]];

    // Scan
    do {

        // Get non new line characters
        temp = nil;
        [scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp];
        if (temp) [result appendString:temp];
        temp = nil;

        // Add <br /> s
        // NOTE(review): the "<br />" literal was destroyed in the damaged source and is
        // reconstructed here - confirm against version control.
        if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) {
            NSUInteger runLength = temp.length;
            for (NSUInteger i = 0; i < runLength; i++) {
                // Combine \r\n into just 1 <br />, also in the middle of a run
                if ([temp characterAtIndex:i] == '\r' &&
                    i + 1 < runLength &&
                    [temp characterAtIndex:i + 1] == '\n') {
                    i++;
                }
                [result appendString:@"<br />"];
            }
        }

    } while (![scanner isAtEnd]);

    // Cleanup & return
    [scanner release];
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];
}

/**
 Removes new lines and collapses white space in the string.

 Each run of whitespace/new line characters becomes a single space, except at
 the very beginning or end where it is dropped.

 @return A new autoreleased string.
 */
- (NSString *)stringByRemovingNewLinesAndWhitespace {

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Strange New lines:
    //  Next Line, U+0085
    //  Form Feed, U+000C
    //  Line Separator, U+2028
    //  Paragraph Separator, U+2029

    // Scanner
    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableString *result = [[NSMutableString alloc] init];
    NSString *temp;
    NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:
                                                      [NSString stringWithFormat:@" \t\n\r%C%C%C%C",
                                                       (unichar)0x0085, (unichar)0x000C, (unichar)0x2028, (unichar)0x2029]];

    // Scan
    while (![scanner isAtEnd]) {

        // Get non new line or whitespace characters
        temp = nil;
        [scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&temp];
        if (temp) [result appendString:temp];

        // Replace with a space
        if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
            // Dont append space to beginning or end of result
            if (result.length > 0 && ![scanner isAtEnd]) {
                [result appendString:@" "];
            }
        }
    }

    // Cleanup
    [scanner release];

    // Retain so the string survives the pool drain
    NSString *retString = [[NSString stringWithString:result] retain];
    [result release];

    // Drain
    [pool drain];

    // Return
    return [retString autorelease];
}

/**
 Strips HTML tags.

 DEPRECATED - Please use -stringByConvertingHTMLToPlainText.

 Every distinct "<...>" tag is replaced by a space (or by nothing for a few
 attribute-less inline tags), then whitespace is collapsed with
 -stringByRemovingNewLinesAndWhitespace.

 @return A new autoreleased string; a plain copy when the receiver has no '<'.
 */
- (NSString *)stringByStrippingTags {

    // Find first < and short-cut if we can. This happens before the pool is
    // created so the early return cannot leave an undrained pool behind.
    NSUInteger tagIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location;
    if (tagIndex == NSNotFound) {
        return [NSString stringWithString:self]; // return copy of string as no tags found
    }

    // Pool
    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

    // Scan and find all tags
    NSScanner *scanner = [NSScanner scannerWithString:self];
    [scanner setCharactersToBeSkipped:nil];
    NSMutableSet *tags = [[NSMutableSet alloc] init];
    NSString *tag;
    do {

        // Scan up to <
        tag = nil;
        [scanner scanUpToString:@"<" intoString:NULL];
        [scanner scanUpToString:@">" intoString:&tag];

        // Add to set
        if (tag) {
            NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag];
            [tags addObject:t];
            [t release];
        }

    } while (![scanner isAtEnd]);

    // Strings
    NSMutableString *result = [[NSMutableString alloc] initWithString:self];
    NSString *finalString;

    // Inline elements whose tags are removed without leaving a space.
    // NOTE(review): these eight literals were empty in the damaged source; the values
    // below are a reconstruction - confirm against version control.
    NSSet *inlineTags = [NSSet setWithObjects:@"<a>", @"</a>", @"<span>", @"</span>",
                         @"<strong>", @"</strong>", @"<em>", @"</em>", nil];

    // Replace tags
    for (NSString *t in tags) {

        // Replace tag with space unless it's an inline element
        NSString *replacement = [inlineTags containsObject:t] ? @"" : @" ";

        // Replace
        [result replaceOccurrencesOfString:t
                                withString:replacement
                                   options:NSLiteralSearch
                                     range:NSMakeRange(0, result.length)];
    }

    // Remove multi-spaces and line breaks; retain so it survives the pool drain
    finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];

    // Cleanup
    [result release];
    [tags release];

    // Drain
    [pool drain];

    // Return
    return [finalString autorelease];
}

/**
 Rewrites the <img> tags of an HTML string so that each image gets the given width.

 A few escaped entities are unescaped first. Every <img ...> tag that contains
 an "http" URL terminated by a double quote is then replaced by a freshly built
 <img> tag carrying that URL and the requested width. Tags without such a URL
 are left untouched.

 @param width The width to write into each rebuilt <img> tag.
 @return The rewritten HTML, or an empty string when the receiver is empty.
 */
- (NSString *)htmlWebAutoImageSizeWidth:(CGFloat)width {
    if (self.length == 0) {
        return @"";
    }

    // NOTE(review): "&quot" (no semicolon) is replaced first, so "&quot;" turns into "';"
    // and the final "&quot;" replacement can no longer match. Order kept as found.
    // NOTE(review): the "&lt;", "&gt;" and "&quot;" literals were entity-decoded in the
    // damaged source and are reconstructed here - confirm against version control.
    NSString *content = [self stringByReplacingOccurrencesOfString:@"&quot" withString:@"'"];
    content = [content stringByReplacingOccurrencesOfString:@"&lt;" withString:@"<"];
    content = [content stringByReplacingOccurrencesOfString:@"&gt;" withString:@">"];
    content = [content stringByReplacingOccurrencesOfString:@"&quot;" withString:@"\""];

    NSString *html = content;
    NSString *regExpStr = @"<(img|IMG)[^\\<\\>]*>";
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:regExpStr
                                                                           options:NSRegularExpressionCaseInsensitive
                                                                             error:nil];
    NSArray *matches = [regex matchesInString:html options:0 range:NSMakeRange(0, [html length])];

    // <img> tags found in the HTML
    NSMutableArray *imgArray = [NSMutableArray array];
    // URLs extracted from those <img> tags (same order as imgArray)
    NSMutableArray *urlArray = [NSMutableArray array];

    for (NSTextCheckingResult *match in matches) {
        NSString *imgTag = [html substringWithRange:match.range];

        // The URL runs from the first "http" to the next double quote. Skip tags
        // that have neither instead of raising NSRangeException.
        NSRange urlStart = [imgTag rangeOfString:@"http"];
        if (urlStart.location == NSNotFound) {
            continue;
        }
        NSString *fromURL = [imgTag substringFromIndex:urlStart.location];
        NSRange urlEnd = [fromURL rangeOfString:@"\""];
        if (urlEnd.location == NSNotFound) {
            continue;
        }

        [urlArray addObject:[fromURL substringToIndex:urlEnd.location]];
        [imgArray addObject:imgTag];
    }

    for (NSUInteger i = 0; i < imgArray.count; i++) {
        // Time-based number, offset by the index so each image gets its own value.
        // Converted explicitly: passing a double to %lld is undefined behaviour.
        long long imageNumber = (long long)([NSDate timeIntervalSinceReferenceDate] + i);

        // NOTE(review): the tag markup of this format string was destroyed in the damaged
        // source (only "\"%lld\"" survived). The attribute names below are a
        // reconstruction - confirm against version control.
        NSString *replacement = [NSString stringWithFormat:@"<img src=\"%@\" alt=\"%lld\" width=\"%f\">",
                                 urlArray[i], imageNumber, (double)width];
        html = [html stringByReplacingOccurrencesOfString:imgArray[i] withString:replacement];
    }
    return html;
}

@end