RegexUtils.cs 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text.RegularExpressions;
  4. namespace Tede.Data.Tests.Utils
  5. {
  /// <summary>
  /// Regular-expression helpers for pulling attribute values and tag contents out of HTML text.
  /// </summary>
  /// <remarks>
  /// Every pattern handled by this class is evaluated with <see cref="Options"/>:
  /// single-line mode, case-insensitive, and with unescaped pattern whitespace ignored.
  /// </remarks>
  public static class RegexUtils
  {
    /*
     * Generic (lazy "match anything"): .*?
     * All links: <a\s*.*?href=(?:"(?<url>[^"]*)"|'(?<url>[^']*)'|(?<url>\S+)).*?>
     */

    // Shared by every method. Because IgnorePatternWhitespace is set, literal
    // spaces inside a caller-supplied pattern must be escaped to be matched.
    private static readonly RegexOptions Options =
      RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace;

    /// <summary>
    /// Collects the distinct <c>src</c> attribute values that follow the text
    /// <c>img</c> or <c>input</c> in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">The markup to scan; <c>null</c> yields an empty list.</param>
    /// <returns>Distinct values in order of first appearance (quoted or unquoted forms).</returns>
    public static List<string> GetOriginalImageSrcs(string html)
    {
      const string regex = "(img|input)[^><]*\\s+src\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))";
      return GetContents("url", regex, html);
    }

    /// <summary>
    /// Collects the distinct <c>href</c> attribute values that follow the letter
    /// <c>a</c> in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">The markup to scan; <c>null</c> yields an empty list.</param>
    /// <returns>Distinct values in order of first appearance (quoted or unquoted forms).</returns>
    public static List<string> GetOriginalLinkHrefs(string html)
    {
      const string regex = "a[^><]*\\s+href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^>\\s]*))";
      return GetContents("url", regex, html);
    }

    /// <summary>
    /// Collects the distinct, whitespace-trimmed inner contents of every
    /// <paramref name="tagName"/> element whose opening tag is followed by whitespace
    /// (i.e. has room for attributes).
    /// </summary>
    /// <param name="tagName">
    /// Tag name; it is inserted into the pattern verbatim, so regex metacharacters in it
    /// are interpreted as pattern syntax.
    /// </param>
    /// <param name="html">The markup to scan; <c>null</c> yields an empty list.</param>
    /// <returns>Distinct contents in order of first appearance.</returns>
    public static List<string> GetTagInnerContents(string tagName, string html)
    {
      string regex = $"<{tagName}\\s+[^><]*>\\s*(?<content>[\\s\\S]+?)\\s*</{tagName}>";
      return GetContents("content", regex, html);
    }

    /// <summary>
    /// Returns the inner content of the first <paramref name="tagName"/> element found in
    /// <paramref name="html"/>, or an empty string when there is none.
    /// </summary>
    /// <param name="tagName">
    /// Tag name; it is inserted into the pattern verbatim, so regex metacharacters in it
    /// are interpreted as pattern syntax.
    /// </param>
    /// <param name="html">The markup to scan; <c>null</c> yields an empty string.</param>
    public static string GetInnerContent(string tagName, string html)
    {
      string regex = $"<{tagName}[^><]*>(?<content>[\\s\\S]+?)</{tagName}>";
      return GetContent("content", regex, html);
    }

    /// <summary>
    /// Returns the value captured by the named group in the first match of
    /// <paramref name="regex"/> against <paramref name="html"/>.
    /// </summary>
    /// <param name="groupName">Name of the capture group to read.</param>
    /// <param name="regex">The pattern to apply.</param>
    /// <param name="html">The text to search.</param>
    /// <returns>
    /// An empty string when <paramref name="regex"/> is null/empty, when
    /// <paramref name="html"/> is <c>null</c>, or when nothing matches. When the pattern does
    /// not contain the text <c>&lt;groupName&gt;</c>, the pattern itself is returned unchanged.
    /// </returns>
    public static string GetContent(string groupName, string regex, string html)
    {
      if (string.IsNullOrEmpty(regex)) return string.Empty;

      // NOTE(review): a pattern without the named group is handed back as-is; presumably
      // this lets callers pass a literal value instead of a pattern - confirm with callers.
      if (regex.IndexOf("<" + groupName + ">", StringComparison.Ordinal) == -1)
      {
        return regex;
      }

      // Regex.Match rejects a null input; treat it as "no match" instead of throwing.
      if (html == null) return string.Empty;

      var match = Regex.Match(html, regex, Options);
      return match.Success ? match.Groups[groupName].Value : string.Empty;
    }

    /// <summary>
    /// Replaces every match of <paramref name="regex"/> in <paramref name="input"/> with
    /// <paramref name="replacement"/>.
    /// </summary>
    /// <returns>
    /// The substituted text; a null or empty <paramref name="input"/> is returned unchanged.
    /// </returns>
    public static string Replace(string regex, string input, string replacement)
    {
      if (string.IsNullOrEmpty(input)) return input;
      return Regex.Replace(input, regex, replacement, Options);
    }

    /// <summary>
    /// Tells whether <paramref name="regex"/> matches anywhere in <paramref name="input"/>.
    /// </summary>
    /// <returns><c>true</c> on a match; <c>false</c> otherwise, including for a <c>null</c> input.</returns>
    public static bool IsMatch(string regex, string input)
    {
      // A null input cannot match anything; answer false rather than throwing.
      if (input == null) return false;
      return Regex.IsMatch(input, regex, Options);
    }

    /// <summary>
    /// Collects the distinct values captured by the named group across all matches of
    /// <paramref name="regex"/> in <paramref name="html"/>.
    /// </summary>
    /// <param name="groupName">Name of the capture group to read from each match.</param>
    /// <param name="regex">The pattern to apply; null/empty yields an empty list.</param>
    /// <param name="html">The text to search; <c>null</c> yields an empty list.</param>
    /// <returns>Distinct captured values in order of first appearance.</returns>
    public static List<string> GetContents(string groupName, string regex, string html)
    {
      var values = new List<string>();
      if (string.IsNullOrEmpty(regex) || html == null) return values;

      // The set gives O(1) duplicate checks; the list preserves first-seen order.
      var seen = new HashSet<string>();
      for (var match = Regex.Match(html, regex, Options); match.Success; match = match.NextMatch())
      {
        var value = match.Groups[groupName].Value;
        if (seen.Add(value))
        {
          values.Add(value);
        }
      }
      return values;
    }
  }
  76. }