NSString+HTML.m 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. //
  2. // NSString+HTML.m
  3. // MWFeedParser
  4. //
  5. // Copyright (c) 2010 Michael Waterfall
  6. //
  7. // Permission is hereby granted, free of charge, to any person obtaining a copy
  8. // of this software and associated documentation files (the "Software"), to deal
  9. // in the Software without restriction, including without limitation the rights
  10. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. // copies of the Software, and to permit persons to whom the Software is
  12. // furnished to do so, subject to the following conditions:
  13. //
  14. // 1. The above copyright notice and this permission notice shall be included
  15. // in all copies or substantial portions of the Software.
  16. //
  17. // 2. This Software cannot be used to archive or collect data such as (but not
  18. // limited to) that of events, news, experiences and activities, for the
  19. // purpose of any concept relating to diary/journal keeping.
  20. //
  21. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  24. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  27. // THE SOFTWARE.
  28. //
  29. #import "NSString+HTML.h"
  30. #import "GTMNSString+HTML.h"
  31. @implementation NSString (HTML)
  32. #pragma mark -
  33. #pragma mark Class Methods
  34. #pragma mark -
  35. #pragma mark Instance Methods
  36. // Strip HTML tags
  37. - (NSString *)stringByConvertingHTMLToPlainText {
  38. // Pool
  39. NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
  40. // Character sets
  41. NSCharacterSet *stopCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@"< \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
  42. NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@" \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
  43. NSCharacterSet *tagNameCharacters = [NSCharacterSet characterSetWithCharactersInString:@"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]; /**/
  44. // Scan and find all tags
  45. NSMutableString *result = [[NSMutableString alloc] initWithCapacity:self.length];
  46. NSScanner *scanner = [[NSScanner alloc] initWithString:self];
  47. [scanner setCharactersToBeSkipped:nil];
  48. [scanner setCaseSensitive:YES];
  49. NSString *str = nil, *tagName = nil;
  50. BOOL dontReplaceTagWithSpace = NO;
  51. do {
  52. // Scan up to the start of a tag or whitespace
  53. if ([scanner scanUpToCharactersFromSet:stopCharacters intoString:&str]) {
  54. [result appendString:str];
  55. str = nil; // reset
  56. }
  57. // Check if we've stopped at a tag/comment or whitespace
  58. if ([scanner scanString:@"<" intoString:NULL]) {
  59. // Stopped at a comment or tag
  60. if ([scanner scanString:@"!--" intoString:NULL]) {
  61. // Comment
  62. [scanner scanUpToString:@"-->" intoString:NULL];
  63. [scanner scanString:@"-->" intoString:NULL];
  64. } else {
  65. // Tag - remove and replace with space unless it's
  66. if ([scanner scanString:@"/p>" intoString:NULL]) {
  67. [result appendString:@"\n"];
  68. [result appendString:@" "];
  69. }
  70. if ([scanner scanString:@"/h" intoString:NULL]) {
  71. [result appendString:@"\n"];
  72. }
  73. if ([scanner scanString:@"img" intoString:NULL]) {
  74. [scanner scanUpToString:@"src" intoString:NULL];
  75. [scanner scanString:@"src" intoString:NULL];
  76. [scanner scanString:@"=" intoString:NULL];
  77. [scanner scanString:@"\'" intoString:NULL];
  78. [scanner scanString:@"\"" intoString:NULL];
  79. NSString *imgString;
  80. if ([scanner scanUpToCharactersFromSet:[NSCharacterSet characterSetWithCharactersInString:@"\"\'"] intoString:&imgString]) {
  81. [result appendString:[NSString stringWithFormat:@"\n<img>%@</img>\n",imgString]];
  82. imgString = nil; // reset
  83. }
  84. }
  85. if ([scanner scanString:@"title" intoString:NULL]) {
  86. [scanner scanUpToString:@"</title>" intoString:NULL];
  87. [scanner scanString:@"</title>" intoString:NULL];
  88. }
  89. // a closing inline tag then dont replace with a space
  90. if ([scanner scanString:@"/" intoString:NULL]) {
  91. // Closing tag - replace with space unless it's inline
  92. tagName = nil; dontReplaceTagWithSpace = NO;
  93. if ([scanner scanCharactersFromSet:tagNameCharacters intoString:&tagName]) {
  94. tagName = [tagName lowercaseString];
  95. dontReplaceTagWithSpace = ([tagName isEqualToString:@"a"] ||
  96. [tagName isEqualToString:@"b"] ||
  97. [tagName isEqualToString:@"i"] ||
  98. [tagName isEqualToString:@"q"] ||
  99. [tagName isEqualToString:@"span"] ||
  100. [tagName isEqualToString:@"em"] ||
  101. [tagName isEqualToString:@"strong"] ||
  102. [tagName isEqualToString:@"cite"] ||
  103. [tagName isEqualToString:@"abbr"] ||
  104. [tagName isEqualToString:@"acronym"] ||
  105. [tagName isEqualToString:@"label"]);
  106. }
  107. // Replace tag with string unless it was an inline
  108. if (!dontReplaceTagWithSpace && result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "];
  109. }
  110. // Scan past tag
  111. [scanner scanUpToString:@">" intoString:NULL];
  112. [scanner scanString:@">" intoString:NULL];
  113. }
  114. } else {
  115. // Stopped at whitespace - replace all whitespace and newlines with a space
  116. if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
  117. if (result.length > 0 && ![scanner isAtEnd]) [result appendString:@" "]; // Dont append space to beginning or end of result
  118. }
  119. }
  120. } while (![scanner isAtEnd]);
  121. // Cleanup
  122. [scanner release];
  123. // Decode HTML entities and return
  124. NSString *retString = [[result stringByDecodingHTMLEntities] retain];
  125. [result release];
  126. // Drain
  127. [pool drain];
  128. // Return
  129. return [retString autorelease];
  130. }
  131. // Decode all HTML entities using GTM
  132. - (NSString *)stringByDecodingHTMLEntities {
  133. // gtm_stringByUnescapingFromHTML can return self so create new string ;)
  134. return [NSString stringWithString:[self gtm_stringByUnescapingFromHTML]];
  135. }
  136. // Encode all HTML entities using GTM
  137. - (NSString *)stringByEncodingHTMLEntities {
  138. // gtm_stringByUnescapingFromHTML can return self so create new string ;)
  139. return [NSString stringWithString:[self gtm_stringByEscapingForAsciiHTML]];
  140. }
  141. // Replace newlines with <br /> tags
  142. - (NSString *)stringWithNewLinesAsBRs {
  143. // Pool
  144. NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
  145. // Strange New lines:
  146. // Next Line, U+0085
  147. // Form Feed, U+000C
  148. // Line Separator, U+2028
  149. // Paragraph Separator, U+2029
  150. // Scanner
  151. NSScanner *scanner = [[NSScanner alloc] initWithString:self];
  152. [scanner setCharactersToBeSkipped:nil];
  153. NSMutableString *result = [[NSMutableString alloc] init];
  154. NSString *temp;
  155. NSCharacterSet *newLineCharacters = [NSCharacterSet characterSetWithCharactersInString:
  156. [NSString stringWithFormat:@"\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
  157. // Scan
  158. do {
  159. // Get non new line characters
  160. temp = nil;
  161. [scanner scanUpToCharactersFromSet:newLineCharacters intoString:&temp];
  162. if (temp) [result appendString:temp];
  163. temp = nil;
  164. // Add <br /> s
  165. if ([scanner scanString:@"\r\n" intoString:nil]) {
  166. // Combine \r\n into just 1 <br />
  167. [result appendString:@"<br />"];
  168. } else if ([scanner scanCharactersFromSet:newLineCharacters intoString:&temp]) {
  169. // Scan other new line characters and add <br /> s
  170. if (temp) {
  171. for (int i = 0; i < temp.length; i++) {
  172. [result appendString:@"<br />"];
  173. }
  174. }
  175. }
  176. } while (![scanner isAtEnd]);
  177. // Cleanup & return
  178. [scanner release];
  179. NSString *retString = [[NSString stringWithString:result] retain];
  180. [result release];
  181. // Drain
  182. [pool drain];
  183. // Return
  184. return [retString autorelease];
  185. }
  186. // Remove newlines and white space from strong
  187. - (NSString *)stringByRemovingNewLinesAndWhitespace {
  188. // Pool
  189. NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
  190. // Strange New lines:
  191. // Next Line, U+0085
  192. // Form Feed, U+000C
  193. // Line Separator, U+2028
  194. // Paragraph Separator, U+2029
  195. // Scanner
  196. NSScanner *scanner = [[NSScanner alloc] initWithString:self];
  197. [scanner setCharactersToBeSkipped:nil];
  198. NSMutableString *result = [[NSMutableString alloc] init];
  199. NSString *temp;
  200. NSCharacterSet *newLineAndWhitespaceCharacters = [NSCharacterSet characterSetWithCharactersInString:
  201. [NSString stringWithFormat:@" \t\n\r%C%C%C%C", 0x0085, 0x000C, 0x2028, 0x2029]];
  202. // Scan
  203. while (![scanner isAtEnd]) {
  204. // Get non new line or whitespace characters
  205. temp = nil;
  206. [scanner scanUpToCharactersFromSet:newLineAndWhitespaceCharacters intoString:&temp];
  207. if (temp) [result appendString:temp];
  208. // Replace with a space
  209. if ([scanner scanCharactersFromSet:newLineAndWhitespaceCharacters intoString:NULL]) {
  210. if (result.length > 0 && ![scanner isAtEnd]) // Dont append space to beginning or end of result
  211. [result appendString:@" "];
  212. }
  213. }
  214. // Cleanup
  215. [scanner release];
  216. // Return
  217. NSString *retString = [[NSString stringWithString:result] retain];
  218. [result release];
  219. // Drain
  220. [pool drain];
  221. // Return
  222. return [retString autorelease];
  223. }
  224. // Strip HTML tags
  225. // DEPRECIATED - Please use NSString stringByConvertingHTMLToPlainText
  226. - (NSString *)stringByStrippingTags {
  227. // Pool
  228. NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
  229. // Find first & and short-cut if we can
  230. NSUInteger ampIndex = [self rangeOfString:@"<" options:NSLiteralSearch].location;
  231. if (ampIndex == NSNotFound) {
  232. return [NSString stringWithString:self]; // return copy of string as no tags found
  233. }
  234. // Scan and find all tags
  235. NSScanner *scanner = [NSScanner scannerWithString:self];
  236. [scanner setCharactersToBeSkipped:nil];
  237. NSMutableSet *tags = [[NSMutableSet alloc] init];
  238. NSString *tag;
  239. do {
  240. // Scan up to <
  241. tag = nil;
  242. [scanner scanUpToString:@"<" intoString:NULL];
  243. [scanner scanUpToString:@">" intoString:&tag];
  244. // Add to set
  245. if (tag) {
  246. NSString *t = [[NSString alloc] initWithFormat:@"%@>", tag];
  247. [tags addObject:t];
  248. [t release];
  249. }
  250. } while (![scanner isAtEnd]);
  251. // Strings
  252. NSMutableString *result = [[NSMutableString alloc] initWithString:self];
  253. NSString *finalString;
  254. // Replace tags
  255. NSString *replacement;
  256. for (NSString *t in tags) {
  257. // Replace tag with space unless it's an inline element
  258. replacement = @" ";
  259. if ([t isEqualToString:@"<a>"] ||
  260. [t isEqualToString:@"</a>"] ||
  261. [t isEqualToString:@"<span>"] ||
  262. [t isEqualToString:@"</span>"] ||
  263. [t isEqualToString:@"<strong>"] ||
  264. [t isEqualToString:@"</strong>"] ||
  265. [t isEqualToString:@"<em>"] ||
  266. [t isEqualToString:@"</em>"]) {
  267. replacement = @"";
  268. }
  269. // Replace
  270. [result replaceOccurrencesOfString:t
  271. withString:replacement
  272. options:NSLiteralSearch
  273. range:NSMakeRange(0, result.length)];
  274. }
  275. // Remove multi-spaces and line breaks
  276. finalString = [[result stringByRemovingNewLinesAndWhitespace] retain];
  277. // Cleanup
  278. [result release];
  279. [tags release];
  280. // Drain
  281. [pool drain];
  282. // Return
  283. return [finalString autorelease];
  284. }
  285. /**
  286. 过滤HTML字符串中的图片指定宽度
  287. @param width 宽度
  288. @return result
  289. */
  290. - (NSString *)htmlWebAutoImageSizeWidth:(CGFloat)width{
  291. if (self == nil || self.length == 0) {
  292. return @"";
  293. }
  294. NSString *content = [self stringByReplacingOccurrencesOfString:@"&amp;quot" withString:@"'"];
  295. content = [content stringByReplacingOccurrencesOfString:@"&lt;" withString:@"<"];
  296. content = [content stringByReplacingOccurrencesOfString:@"&gt;" withString:@">"];
  297. content = [content stringByReplacingOccurrencesOfString:@"&quot;" withString:@"\""];
  298. NSString *html = content;
  299. NSString * regExpStr = @"<(img|IMG)[^\\<\\>]*>";
  300. NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:regExpStr options:NSRegularExpressionCaseInsensitive error:nil];
  301. NSArray *matches = [regex matchesInString:html
  302. options:0
  303. range:NSMakeRange(0, [html length])];
  304. //HTML中的<img ...... />数组
  305. NSMutableArray *imgArray = [NSMutableArray array];
  306. //<img src="URL"/>中的URL数组
  307. NSMutableArray *urlArray = [NSMutableArray array];
  308. for (NSTextCheckingResult *result in matches) {
  309. NSRange range = result.range;
  310. NSString *group = [html substringWithRange:range];
  311. NSRange srange1 = [group rangeOfString:@"http"];
  312. NSString *tempString1 = [group substringWithRange:NSMakeRange(srange1.location, group.length - srange1.location)];
  313. NSRange srange2 = [tempString1 rangeOfString:@"\""];
  314. NSString *tempString2 = [tempString1 substringWithRange:NSMakeRange(0,srange2.location)];
  315. [urlArray addObject:tempString2];
  316. [imgArray addObject:group];
  317. }
  318. for (int i = 0; i < imgArray.count; i++) {
  319. NSString *string = imgArray[i];
  320. html = [html stringByReplacingOccurrencesOfString:string withString:[NSString stringWithFormat:@"<img src=\"%@\" title=\"\" alt=\"%lld\" width=\"%f\" height=\"auto\">",urlArray[i],[NSDate timeIntervalSinceReferenceDate]+i,width]];
  321. }
  322. return html;
  323. }
  324. @end