Skip to content
索引

源码

ts
import { MultiLineStream } from './MultiLineStream';

/**
 * HTML tag name (explaining the regex)
 *
 * This regex is for the name of the html tag
 * E.g. we want to match "div" inside "<div>"
 *
 * ^  ### start
 * [:\w]  ### ":" or character or digit
 * ((?![>\/])[\S])  ### everything except closing brackets
 */
const htmlTagNameRE = /^[!:\w\$]((?![>\/])[\S])*/;

/**
 * Empty html tag, e.g. `< ></>`
 */
const htmlTagNameEmptyRE = /^\s+/;

/**
 * Html attribute name (explaining the regex)
 *
 * This regex is for html attribute names,
 * E.g. we want to match "class" in "<div class="center">"
 *
 * ^  ### start
 *   [^\s"'>/=]*  ### any anything that isn't whitespace, ", ', >, / or =
 */
const htmlAttributeNameRE = /^[^\s"'>/=]*/;

/**
 * Html attribute value (explaining the regex)
 *
 * ^  ### start
 *   [^\s"'`=<>/]+  ### no whitespace, double quotes, single quotes, back quotes, "=", "<", ">" and "/"
 */
const htmlAttributeValueRE = /^[^\s"'`=<>/]+/;

export const enum TokenTypeFast {
  StartCommentTag, // "<--" part of "<!-- this is a comment -->"
  Comment, // " this is a comment " part of "<!-- this is a comment -->"
  EndCommentTag, // "-->" part of "<!-- this is a comment -->"
  StartTagOpen, // "<" part of "<html>"
  StartTagClose, // ">" part of "<html>"
  StartTagSelfClose, // "/>" part of "<input />"
  StartTag, // "input" part of "<input>"
  EndTagOpen, // "<" part of "</html>"
  EndTagClose, // ">" part of "</html>"
  EndTag, // "html" part of </html>
  AttributeName, // "class" part of "<div class="center">"
  AttributeValue, // "center" part of "<div class="center">"
  Content, // "this is text" part of "<p>this is text</p>"
  EOS, // end of stream
  DelimiterAssign, // "=" part of "<div class="center">
  Unknown, // anything that doesn't make sense, e.g. ";" in "i <length;"
  WhiteSpace
}

export interface ScannerFast {
  readonly scan: () => TokenTypeFast;
  readonly getTokenText: () => string;
  readonly stream: MultiLineStream;
  state: ScannerStateFast;
}

export const enum ScannerStateFast {
  WithinContent,
  AfterOpeningStartTag,
  AfterOpeningEndTag,
  WithinStartTag,
  WithinEndTag,
  WithinComment,
  AfterAttributeName,
  BeforeAttributeValue
}

export function createScannerFast({
  input,
  initialOffset,
  initialState,
  matchingTagPairs
}: {
  input: string;
  initialOffset: number;
  initialState: ScannerStateFast;
  matchingTagPairs: readonly [string, string][];
}): ScannerFast {
  const stream = new MultiLineStream(input, initialOffset, matchingTagPairs);
  let state: ScannerStateFast = initialState;
  let tokenOffset: number;
  /**
   * Whether or not a space is after the starting tag name.
   * E.g. "<div >" but not "<div''>" and "<div class="center" >" but not "<div class="center">""
   * This is used to determine whether the following characters are attributes or just invalid
   */

  function nextElementName(): string | undefined {
    let result = stream.advanceIfRegExp(htmlTagNameRE);
    if (result === undefined) {
      if (stream.advanceIfRegExp(htmlTagNameEmptyRE)) {
        result = '';
      }
    }
    return result;
  }

  let lastTagName: string | undefined;
  // @ts-ignore
  function scan(): TokenTypeFast {
    tokenOffset = stream.position;
    if (stream.eos()) {
      return TokenTypeFast.EOS;
    }
    switch (state) {
      case ScannerStateFast.AfterOpeningEndTag:
        const tagName = nextElementName();
        if (tagName) {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        } else if (stream.peekRight(0) === '>') {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        }
        return TokenTypeFast.Unknown;

      case ScannerStateFast.AfterOpeningStartTag:
        lastTagName = nextElementName();
        if (lastTagName !== undefined) {
          if (lastTagName === '') {
            tokenOffset = stream.position;
          }
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // this is a tag like "<>"
        if (stream.peekRight() === '>') {
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // At this point there is no tag name sign after the opening tag "<"
        // E.g. "< div"
        // So we just assume that it is text
        state = ScannerStateFast.WithinContent;
        return scan();
      default:
        break;
    }
  }

  return {
    scan,
    stream,
    getTokenText() {
      return stream.getSource().slice(tokenOffset, stream.position);
    },
    set state(newState: any) {
      state = newState;
    }
  };
}
import { MultiLineStream } from './MultiLineStream';

/**
 * HTML tag name (explaining the regex)
 *
 * This regex is for the name of the html tag
 * E.g. we want to match "div" inside "<div>"
 *
 * ^  ### start
 * [:\w]  ### ":" or character or digit
 * ((?![>\/])[\S])  ### everything except closing brackets
 */
const htmlTagNameRE = /^[!:\w\$]((?![>\/])[\S])*/;

/**
 * Empty html tag, e.g. `< ></>`
 */
const htmlTagNameEmptyRE = /^\s+/;

/**
 * Html attribute name (explaining the regex)
 *
 * This regex is for html attribute names,
 * E.g. we want to match "class" in "<div class="center">"
 *
 * ^  ### start
 *   [^\s"'>/=]*  ### any anything that isn't whitespace, ", ', >, / or =
 */
const htmlAttributeNameRE = /^[^\s"'>/=]*/;

/**
 * Html attribute value (explaining the regex)
 *
 * ^  ### start
 *   [^\s"'`=<>/]+  ### no whitespace, double quotes, single quotes, back quotes, "=", "<", ">" and "/"
 */
const htmlAttributeValueRE = /^[^\s"'`=<>/]+/;

export const enum TokenTypeFast {
  StartCommentTag, // "<--" part of "<!-- this is a comment -->"
  Comment, // " this is a comment " part of "<!-- this is a comment -->"
  EndCommentTag, // "-->" part of "<!-- this is a comment -->"
  StartTagOpen, // "<" part of "<html>"
  StartTagClose, // ">" part of "<html>"
  StartTagSelfClose, // "/>" part of "<input />"
  StartTag, // "input" part of "<input>"
  EndTagOpen, // "<" part of "</html>"
  EndTagClose, // ">" part of "</html>"
  EndTag, // "html" part of </html>
  AttributeName, // "class" part of "<div class="center">"
  AttributeValue, // "center" part of "<div class="center">"
  Content, // "this is text" part of "<p>this is text</p>"
  EOS, // end of stream
  DelimiterAssign, // "=" part of "<div class="center">
  Unknown, // anything that doesn't make sense, e.g. ";" in "i <length;"
  WhiteSpace
}

export interface ScannerFast {
  readonly scan: () => TokenTypeFast;
  readonly getTokenText: () => string;
  readonly stream: MultiLineStream;
  state: ScannerStateFast;
}

export const enum ScannerStateFast {
  WithinContent,
  AfterOpeningStartTag,
  AfterOpeningEndTag,
  WithinStartTag,
  WithinEndTag,
  WithinComment,
  AfterAttributeName,
  BeforeAttributeValue
}

export function createScannerFast({
  input,
  initialOffset,
  initialState,
  matchingTagPairs
}: {
  input: string;
  initialOffset: number;
  initialState: ScannerStateFast;
  matchingTagPairs: readonly [string, string][];
}): ScannerFast {
  const stream = new MultiLineStream(input, initialOffset, matchingTagPairs);
  let state: ScannerStateFast = initialState;
  let tokenOffset: number;
  /**
   * Whether or not a space is after the starting tag name.
   * E.g. "<div >" but not "<div''>" and "<div class="center" >" but not "<div class="center">""
   * This is used to determine whether the following characters are attributes or just invalid
   */

  function nextElementName(): string | undefined {
    let result = stream.advanceIfRegExp(htmlTagNameRE);
    if (result === undefined) {
      if (stream.advanceIfRegExp(htmlTagNameEmptyRE)) {
        result = '';
      }
    }
    return result;
  }

  let lastTagName: string | undefined;
  // @ts-ignore
  function scan(): TokenTypeFast {
    tokenOffset = stream.position;
    if (stream.eos()) {
      return TokenTypeFast.EOS;
    }
    switch (state) {
      case ScannerStateFast.AfterOpeningEndTag:
        const tagName = nextElementName();
        if (tagName) {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        } else if (stream.peekRight(0) === '>') {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        }
        return TokenTypeFast.Unknown;

      case ScannerStateFast.AfterOpeningStartTag:
        lastTagName = nextElementName();
        if (lastTagName !== undefined) {
          if (lastTagName === '') {
            tokenOffset = stream.position;
          }
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // this is a tag like "<>"
        if (stream.peekRight() === '>') {
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // At this point there is no tag name sign after the opening tag "<"
        // E.g. "< div"
        // So we just assume that it is text
        state = ScannerStateFast.WithinContent;
        return scan();
      default:
        break;
    }
  }

  return {
    scan,
    stream,
    getTokenText() {
      return stream.getSource().slice(tokenOffset, stream.position);
    },
    set state(newState: any) {
      state = newState;
    }
  };
}

htmlTagNameRE:用于匹配 HTML 标签的名称的正则表达式

ts
/**
 * HTML 标签名称(解释正则表达式)
 *
 * 此正则表达式用于匹配 HTML 标签的名称,例如我们希望在 "<div>" 中匹配 "div"。
 *
 * ^  ### 开始位置
 * [:\w]  ### ":" 或 字母或数字
 * ((?![>\/])[\S])*  ### 除了闭合括号之外的任意字符
 */
const htmlTagNameRE = /^[!:\w\$]((?![>\/])[\S])*/;
/**
 * HTML 标签名称(解释正则表达式)
 *
 * 此正则表达式用于匹配 HTML 标签的名称,例如我们希望在 "<div>" 中匹配 "div"。
 *
 * ^  ### 开始位置
 * [:\w]  ### ":" 或 字母或数字
 * ((?![>\/])[\S])*  ### 除了闭合括号之外的任意字符
 */
const htmlTagNameRE = /^[!:\w\$]((?![>\/])[\S])*/;

htmlTagNameEmptyRE:匹配空标签的正则表达式

ts
/**
 * 空的 HTML 标签,例如 `< ></>`
 */
const htmlTagNameEmptyRE = /^\s+/;
/**
 * 空的 HTML 标签,例如 `< ></>`
 */
const htmlTagNameEmptyRE = /^\s+/;

创建一个快速的 HTML 扫描器,用于识别和扫描 HTML 文本中的不同记号

这段代码是一个快速的 HTML 扫描器,用于识别和扫描 HTML 文本中的不同记号,如标签、属性、注释和内容文本等。快速扫描器利用了正则表达式进行快速匹配,从而提高了扫描的效率。其中,ScannerFastScannerStateFast 是相关的接口和枚举类型,用于描述扫描器的状态和行为。该扫描器适用于对 HTML 文本进行快速的记号识别和解析,以便在后续的语法解析和处理中使用。

ts
/**
 * 创建快速扫描器
 *
 * @param {object} params - 创建扫描器所需的参数
 * @param {string} params.input - 输入文本
 * @param {number} params.initialOffset - 初始偏移量
 * @param {ScannerStateFast} params.initialState - 初始扫描状态
 * @param {readonly [string, string][]} params.matchingTagPairs - 标签对列表
 * @returns {ScannerFast} - 返回快速扫描器对象
 */
export function createScannerFast({
  input,
  initialOffset,
  initialState,
  matchingTagPairs
}: {
  input: string;
  initialOffset: number;
  initialState: ScannerStateFast;
  matchingTagPairs: readonly [string, string][];
}): ScannerFast {
  // 创建 MultiLineStream 实例,用于处理多行文本流
  const stream = new MultiLineStream(input, initialOffset, matchingTagPairs);
  let state: ScannerStateFast = initialState;
  let tokenOffset: number;

  /**
   * 获取下一个元素名称(标签名或属性名)
   *
   * @returns {string | undefined} - 返回下一个元素名称,如果没有,则返回 undefined
   */
  function nextElementName(): string | undefined {
    let result = stream.advanceIfRegExp(htmlTagNameRE);
    if (result === undefined) {
      if (stream.advanceIfRegExp(htmlTagNameEmptyRE)) {
        result = '';
      }
    }
    return result;
  }

  let lastTagName: string | undefined;
  // 定义 scan 函数,用于扫描并返回记号类型
  function scan(): TokenTypeFast {
    tokenOffset = stream.position;
    if (stream.eos()) {
      return TokenTypeFast.EOS;
    }
    switch (state) {
      case ScannerStateFast.AfterOpeningEndTag:
        const tagName = nextElementName();
        if (tagName) {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        } else if (stream.peekRight(0) === '>') {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        }
        return TokenTypeFast.Unknown;

      case ScannerStateFast.AfterOpeningStartTag:
        lastTagName = nextElementName();
        if (lastTagName !== undefined) {
          if (lastTagName === '') {
            tokenOffset = stream.position;
          }
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // this is a tag like "<>"
        if (stream.peekRight() === '>') {
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // At this point there is no tag name sign after the opening tag "<"
        // E.g. "< div"
        // So we just assume that it is text
        state = ScannerStateFast.WithinContent;
        return scan();
      default:
        break;
    }
  }

  // 返回 ScannerFast 对象
  return {
    scan,
    stream,
    getTokenText() {
      return stream.getSource().slice(tokenOffset, stream.position);
    },
    set state(newState: any) {
      state = newState;
    }
  };
}
/**
 * 创建快速扫描器
 *
 * @param {object} params - 创建扫描器所需的参数
 * @param {string} params.input - 输入文本
 * @param {number} params.initialOffset - 初始偏移量
 * @param {ScannerStateFast} params.initialState - 初始扫描状态
 * @param {readonly [string, string][]} params.matchingTagPairs - 标签对列表
 * @returns {ScannerFast} - 返回快速扫描器对象
 */
export function createScannerFast({
  input,
  initialOffset,
  initialState,
  matchingTagPairs
}: {
  input: string;
  initialOffset: number;
  initialState: ScannerStateFast;
  matchingTagPairs: readonly [string, string][];
}): ScannerFast {
  // 创建 MultiLineStream 实例,用于处理多行文本流
  const stream = new MultiLineStream(input, initialOffset, matchingTagPairs);
  let state: ScannerStateFast = initialState;
  let tokenOffset: number;

  /**
   * 获取下一个元素名称(标签名或属性名)
   *
   * @returns {string | undefined} - 返回下一个元素名称,如果没有,则返回 undefined
   */
  function nextElementName(): string | undefined {
    let result = stream.advanceIfRegExp(htmlTagNameRE);
    if (result === undefined) {
      if (stream.advanceIfRegExp(htmlTagNameEmptyRE)) {
        result = '';
      }
    }
    return result;
  }

  let lastTagName: string | undefined;
  // 定义 scan 函数,用于扫描并返回记号类型
  function scan(): TokenTypeFast {
    tokenOffset = stream.position;
    if (stream.eos()) {
      return TokenTypeFast.EOS;
    }
    switch (state) {
      case ScannerStateFast.AfterOpeningEndTag:
        const tagName = nextElementName();
        if (tagName) {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        } else if (stream.peekRight(0) === '>') {
          state = ScannerStateFast.WithinEndTag;
          return TokenTypeFast.EndTag;
        }
        return TokenTypeFast.Unknown;

      case ScannerStateFast.AfterOpeningStartTag:
        lastTagName = nextElementName();
        if (lastTagName !== undefined) {
          if (lastTagName === '') {
            tokenOffset = stream.position;
          }
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // this is a tag like "<>"
        if (stream.peekRight() === '>') {
          state = ScannerStateFast.WithinStartTag;
          return TokenTypeFast.StartTag;
        }
        // At this point there is no tag name sign after the opening tag "<"
        // E.g. "< div"
        // So we just assume that it is text
        state = ScannerStateFast.WithinContent;
        return scan();
      default:
        break;
    }
  }

  // 返回 ScannerFast 对象
  return {
    scan,
    stream,
    getTokenText() {
      return stream.getSource().slice(tokenOffset, stream.position);
    },
    set state(newState: any) {
      state = newState;
    }
  };
}

Released under the MIT License.