跳转至

RNA Secondary Structure

multimolecule.io.rna_secondary_structure.read_dbn

Python
read_dbn(path: str | Path) -> RnaSecondaryStructureRecord

Parse a dot-bracket (.db/.dbn) file and return a single record.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def read_dbn(path: str | Path) -> RnaSecondaryStructureRecord:
    r"""Parse a dot-bracket (.db/.dbn) file and return a single record."""

    path = Path(path)
    lines = _read_lines(path, drop_comments=True)
    if not lines:
        raise InvalidStructureFile(f"No dot-bracket records found in {path!s}")
    idx = 0
    line = lines[idx]
    id = None
    if line.startswith(">"):
        id = line[1:].strip() or None
        idx += 1
        if idx >= len(lines):
            raise InvalidStructureFile(f"Missing sequence line after defline in {path!s}")
        line = lines[idx]
    sequence = line
    idx += 1
    if idx >= len(lines):
        raise InvalidStructureFile(f"Missing dot-bracket line after sequence in {path!s}")
    dot_line = lines[idx]
    dot_bracket = dot_line
    if " " in dot_bracket or "\t" in dot_bracket:
        terms = dot_bracket.split()
        if len(terms) >= 1 and len(terms[0]) == len(sequence):
            dot_bracket = terms[0]
        else:
            raise InvalidStructureFile(f"Dot-bracket line contains unexpected whitespace in {path!s}: {dot_line}")
    if len(sequence) != len(dot_bracket):
        raise InvalidStructureFile(f"Sequence and dot-bracket lengths differ in {path!s}")
    return RnaSecondaryStructureRecord(sequence=sequence, dot_bracket=dot_bracket, id=id)

multimolecule.io.rna_secondary_structure.write_dbn

Python
write_dbn(
    record: RnaSecondaryStructureRecord, path: str | Path
) -> Path

Write a dot-bracket (.db/.dbn) file for a single record.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def write_dbn(record: RnaSecondaryStructureRecord, path: str | Path) -> Path:
    r"""Write a dot-bracket (.db/.dbn) file for a single record."""

    if not isinstance(record, RnaSecondaryStructureRecord):
        raise TypeError("record must be a RnaSecondaryStructureRecord")
    path = Path(path)
    with path.open("w") as fh:
        if record.id:
            fh.write(f">{record.id}\n")
        fh.write(f"{record.sequence}\n")
        fh.write(f"{record.dot_bracket}\n")
    return path

multimolecule.io.rna_secondary_structure.read_bpseq

Python
read_bpseq(path: str | Path) -> RnaSecondaryStructureRecord

Parse a BPSEQ file and return a record with dot-bracket notation.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def read_bpseq(path: str | Path) -> RnaSecondaryStructureRecord:
    r"""Parse a BPSEQ file and return a record with dot-bracket notation."""

    path = Path(path)
    rows = []
    for lineno, line in enumerate(_read_lines(path, drop_comments=True), 1):
        parts = line.split()
        if len(parts) != 3:
            raise InvalidStructureFile(f"Expected 3 columns in BPSEQ at line {lineno}: {line}")
        try:
            idx = int(parts[0]) - 1
            base = parts[1]
            pair = int(parts[2])
        except Exception as exc:  # pragma: no cover - defensive
            raise InvalidStructureFile(f"Invalid BPSEQ data at line {lineno}: {line}") from exc
        if idx < 0:
            raise InvalidStructureFile(f"Invalid index {idx + 1} at line {lineno}")
        pair_idx = pair - 1 if pair != 0 else -1
        rows.append((idx, base, pair_idx, lineno))

    if not rows:
        raise InvalidStructureFile(f"No BPSEQ records found in {path!s}")

    length = max(idx for idx, _, _, _ in rows) + 1
    sequence = [""] * length
    pair_by_index: Dict[int, int] = {}
    for idx, base, pair_idx, lineno in rows:
        if idx >= length:
            raise InvalidStructureFile(f"Index {idx + 1} out of bounds at line {lineno}")
        if sequence[idx]:
            raise InvalidStructureFile(f"Position {idx + 1} duplicated at line {lineno}")
        sequence[idx] = base
        if pair_idx == -1:
            continue
        if pair_idx < 0:
            raise InvalidStructureFile(f"Invalid pair index {pair_idx + 1} at line {lineno}")
        if idx == pair_idx:
            raise InvalidStructureFile(f"Position {idx + 1} paired to itself (line {lineno})")
        if pair_idx in pair_by_index and pair_by_index[pair_idx] != idx:
            raise InvalidStructureFile(
                f"Position {pair_idx + 1} paired to both {pair_by_index[pair_idx] + 1} and {idx + 1} "
                f"(line {lineno})"
            )
        if idx in pair_by_index:
            raise InvalidStructureFile(f"Position {idx + 1} paired multiple times (line {lineno})")
        pair_by_index[idx] = pair_idx

    if any(base == "" for base in sequence):
        raise InvalidStructureFile(f"Missing sequence positions in {path!s}")

    for i, j in pair_by_index.items():
        if pair_by_index.get(j) != i:
            raise InvalidStructureFile(f"Inconsistent pairing: {i + 1} -> {j + 1} but reverse not found")

    pairs = [(i, j) for i, j in pair_by_index.items() if i < j]
    pairs = np.array(pairs, dtype=int) if pairs else np.empty((0, 2), dtype=int)
    dot_bracket = notations.pairs_to_dot_bracket(pairs, length=length, unsafe=True)
    return RnaSecondaryStructureRecord(sequence="".join(sequence), dot_bracket=dot_bracket, id=path.stem)

multimolecule.io.rna_secondary_structure.write_bpseq

Python
write_bpseq(
    record: RnaSecondaryStructureRecord, path: str | Path
) -> Path

Write a BPSEQ file from a record with dot-bracket notation.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def write_bpseq(record: RnaSecondaryStructureRecord, path: str | Path) -> Path:
    r"""Write a BPSEQ file from a record with dot-bracket notation."""

    if not isinstance(record, RnaSecondaryStructureRecord):
        raise TypeError("record must be a RnaSecondaryStructureRecord")
    path = Path(path)
    pairs = notations.dot_bracket_to_pairs(record.dot_bracket)
    pair_index = _pair_index_from_pairs(pairs, len(record))
    with path.open("w") as fh:
        for idx, base in enumerate(record.sequence):
            partner = pair_index[idx]
            out_partner = partner + 1 if partner != -1 else 0
            fh.write(f"{idx + 1} {base} {out_partner}\n")
    return path

multimolecule.io.rna_secondary_structure.read_rna_secondary_structure_st

Python
read_rna_secondary_structure_st(
    path: str | Path,
) -> RnaSecondaryStructureRecord

Parse sequence and dot-bracket data from a bpRNA .st/.sta file.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def read_rna_secondary_structure_st(path: str | Path) -> RnaSecondaryStructureRecord:
    r"""Parse sequence and dot-bracket data from a bpRNA .st/.sta file."""

    path = Path(path)
    lines = _read_lines(path, drop_comments=False)
    if not lines:
        raise InvalidStructureFile(f"No .st records found in {path!s}")
    idx = 0
    record_id = None
    while idx < len(lines):
        line = lines[idx].strip()
        if not line:
            idx += 1
            continue
        if not line.startswith("#"):
            break
        header = line[1:]
        if ":" in header:
            key, value = header.split(":", 1)
            key = key.strip().lower().replace(" ", "")
            if key in {"id", "name"}:
                record_id = value.strip() or None
        idx += 1
    if idx + 1 >= len(lines):
        raise InvalidStructureFile(f"Incomplete .st record in {path!s}")
    sequence = lines[idx].strip()
    dot_bracket = lines[idx + 1].strip()
    if len(sequence) != len(dot_bracket):
        raise InvalidStructureFile(f"Sequence and dot-bracket lengths differ in {path!s}")
    return RnaSecondaryStructureRecord(sequence=sequence, dot_bracket=dot_bracket, id=record_id)

multimolecule.io.rna_secondary_structure.read_st

Python
read_st(path: str | Path) -> BpRnaRecord

Parse a .st/.sta file and return a single bpRNA record.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def read_st(path: str | Path) -> BpRnaRecord:
    r"""Parse a .st/.sta file and return a single bpRNA record."""

    path = Path(path)
    lines = _read_lines(path, drop_comments=False)
    if not lines:
        raise InvalidStructureFile(f"No .st records found in {path!s}")

    headers: Dict[str, str] = {}
    idx = 0
    while idx < len(lines):
        line = lines[idx].strip()
        if not line:
            idx += 1
            continue
        if not line.startswith("#"):
            break
        header = line[1:]
        if ":" in header:
            key, value = header.split(":", 1)
            key = key.strip().lower().replace(" ", "")
            headers[key] = value.strip()
        idx += 1
    if idx + 3 >= len(lines):
        raise InvalidStructureFile(f"Incomplete .st record in {path!s}")

    sequence = lines[idx].strip()
    dot_bracket = lines[idx + 1].strip()
    structure_array = lines[idx + 2].strip()
    knot_array = lines[idx + 3].strip()
    idx += 4

    if len(sequence) != len(dot_bracket):
        raise InvalidStructureFile(f"Sequence and dot-bracket lengths differ in {path!s}")
    if len(sequence) != len(structure_array):
        raise InvalidStructureFile(f"Sequence and structure array lengths differ in {path!s}")
    if len(sequence) != len(knot_array):
        raise InvalidStructureFile(f"Sequence and knot array lengths differ in {path!s}")

    length_header = headers.get("length")
    if length_header:
        try:
            expected = int("".join(ch for ch in length_header if ch.isdigit()))
        except ValueError as exc:
            raise InvalidStructureFile(f"Invalid Length header in {path!s}") from exc
        if expected != len(sequence):
            raise InvalidStructureFile(f"Length header {expected} != sequence length {len(sequence)} in {path!s}")

    page_number = None
    page_header = headers.get("pagenumber")
    if page_header:
        try:
            page_number = int(page_header)
        except ValueError:
            page_number = None

    structure_types: Dict[str, List[str]] = {}
    for line in lines[idx:]:
        if not line.strip():
            continue
        if line.startswith("#"):
            lowered = line.lower()
            if lowered.startswith("#id:") or lowered.startswith("#name:"):
                raise InvalidStructureFile(f"Multiple .st records found in {path!s}")
            continue
        key = _st_line_key(line)
        structure_types.setdefault(key, []).append(_ensure_newline(line))

    return BpRnaRecord(
        sequence=sequence,
        dot_bracket=dot_bracket,
        id=headers.get("id") or headers.get("name"),
        page_number=page_number,
        structure_array=structure_array,
        knot_array=knot_array,
        structure_types=structure_types,
    )

multimolecule.io.rna_secondary_structure.write_st

Python
write_st(
    record: BpRnaRecord | RnaSecondaryStructureRecord,
    path: str | Path,
    **_: object
) -> Path

Write a .st/.sta file for a single bpRNA record.

Source code in multimolecule/io/rna_secondary_structure.py
Python
def write_st(record: BpRnaRecord | RnaSecondaryStructureRecord, path: str | Path, **_: object) -> Path:
    r"""Write a .st/.sta file for a single bpRNA record."""

    if not isinstance(record, (BpRnaRecord, RnaSecondaryStructureRecord)):
        raise TypeError("record must be a RnaSecondaryStructureRecord or BpRnaRecord")
    if isinstance(record, RnaSecondaryStructureRecord) and not isinstance(record, BpRnaRecord):
        record = BpRnaRecord.from_rna_secondary_structure_record(record)
    path = Path(path)
    with path.open("w") as fh:
        id = record.id or path.stem
        pairs = notations.dot_bracket_to_pairs(record.dot_bracket)
        dot_bracket = notations.pairs_to_dot_bracket(pairs, length=len(record), unsafe=True)
        page_number = record.page_number if record.page_number is not None else 1

        fh.write(f"#Name: {id}\n")
        fh.write(f"#Length: {len(record)}\n")
        fh.write(f"#PageNumber: {page_number}\n")
        fh.write(f"{record.sequence}\n")
        fh.write(f"{dot_bracket}\n")
        fh.write(f"{record.structure_array}\n")
        fh.write(f"{record.knot_array}\n")

        for key in STRUCTURE_TYPE_KEYS:
            for item in record.structure_types.get(key, []):
                fh.write(_ensure_newline(item))
        extra_keys = sorted(k for k in record.structure_types if k not in STRUCTURE_TYPE_KEYS)
        for key in extra_keys:
            for item in record.structure_types.get(key, []):
                fh.write(_ensure_newline(item))
    return path