/* Copyright (C) 1999,2000,2001 Washington University, Saint Louis, Missouri USA. All Rights Reserved. */ #ifndef __XDF_TYPES__ #define __XDF_TYPES__ /************************************************************** CAUTIONS: THIS FILE FORMAT WILL CHANGE TO PROVIDE SUPPORT FOR NEW FEATURES. A few C-language enums are declared below and used in the structures which follow. Enums are stored in XDF files using bigendian, 4-byte fields, just as are elements of type "unsigned long". Elements of type "Euint" (extended unsigned integer or "unsigned long long") are stored in 8-byte fields. The size of "off_t" elements, used to store file offsets, is determined by the value found in "offlen", which may range from 4-8, with default precision of 4 bytes when a database is created. Larger file offsets than the default will be needed for huge databases: nucleotide sequence databases containing more than about 15 giga-bases or protein sequence databases with more than about 4 giga-amino acids. Use of larger file offsets is a command line option to the XDFORMAT program; the program attempts to determine on its own when offsets larger than 4-bytes are required, but such determination is not possible on files read from stdin and does not account for possible appends of additional data after the database is created. When data is read from stdin, file offsets of 5 bytes are used by default. Once the offset precision has been established for a given database, it can not be altered. The in-memory sizes of elements in these structures, or of the structures in their entirety, can not be assumed to coincide with the size of each item or structure on disk. The ulong2b data type is stored as an encoded 30-bit unsigned integer. The high-order 2 bits of the first byte indicate the number of bytes that follow the first (0-3). The encoded value is otherwise stored in bigendian orientation. Watch for NULL file offsets in the Table (TAB) file; these are unused slots. The format of identifier index (IDX) files is not simple and is not shown here. The C structures relevant to the IDX file type are shown, however, to indicate the data that are maintained. ***************************************************************/ /* These are the file types described below in C structure typedefs. */ /* Each file type has type-specific header and record formats */ typedef enum xdf_file_type { XDF_FILETYPE_TAB = 1 , /* table file type */ XDF_FILETYPE_DEF = 2 , /* definitions file type */ XDF_FILETYPE_SEQ = 3 , /* sequence file type */ XDF_FILETYPE_IDX = 4 /* identifier index file type */ } XdfFileType, PNTR XdfFileTypePtr; typedef enum xdf_seq_type { XDF_SEQTYPE_NUCLEOTIDE = 1 , XDF_SEQTYPE_PEPTIDE = 2 } XdfSeqType, PNTR XdfSeqTypePtr; typedef enum xdf_pack { XDF_PACK_NONE = 0 , XDF_PACK_COMPACT = 1 } XdfPack, PNTR XdfPackPtr; #define XDF_STDHDR_INDEXED 1 /*== Standard file header structure ===========================*/ /* XdfStdHdr contains elements that are often used in each of the 3 types of files: Sequence, Definitions and Table (Index). */ typedef struct xdf_std_header { off_t off0; /* starting offset within file */ #define XDF_SIGNATURE 0xd8c4c600 /* "XDF0" + 128 */ Uint4 signature; /* signature unique to "XDF" files */ #define XDF_STDHDR_FMT 1 Uint4 stdhdr_fmt; /* version number of this std header's format */ Uint4 stdhdr_len; /* length of std header alone */ Uint4 tothdr_len; /* total length of std + specialized header */ Uint4 key; /* value common to all files of this db */ XdfSeqType seq_type; /* type of sequences (peptide/nucleotide) */ XdfFileType file_type; /* table, sequence, or definitions */ Uint4 file_fmt; /* version number of file type-specific format */ Uint4 rec_len_max; /* length of longest record */ Uint4 flags; /* See XDF_STDHDR_* */ char reserved[32]; struct stat statbuf; } XdfStdHdr, PNTR XdfStdHdrPtr; #define XDF_STDHDR_FLAG_IDINDEX 1 /*== Table file Header and Record structures ==================*/ /* A Table file contains the file offset for each sequence record and its associated description record. */ #define XDF_FILEFMT_TAB 1 typedef struct xdf_tab_header { XdfStdHdr stdhdr; off_t _seqoff; /* offset to first sequence offset (computed) */ Uint4 flags; Uint4 rec_cnt; /* no. of records allocated in database */ Uint4 rec_live; /* number of "live" records in database */ Uint4 offlen; /* encoded length of each file offset value */ Euint seqlen_tot; /* total length of sequences in the database */ Uint4 seqlen_max; /* length of longest sequence (unpacked) */ Euint edits_tot; /* total no. of edits in database */ Uint4 edits_max; /* max. no. of edits in one sequence */ char title[256]; /* C-string name for the database (user provided) */ char version[64]; /* C-string version (user provided) */ char reldate[64]; /* C-string release date (user provided) */ char credate[64]; /* C-string creation date (automatic) */ char moddate[64]; /* C-string modification date (automatic) */ char alpha_std_name[64]; /* C-string name of standard alphabet */ Uint4 alpha_std_version; char alpha_edit_name[64]; /* C-string name of edit alphabet */ Uint4 alpha_edit_version; } XdfTabHdr, PNTR XdfTabHdrPtr; typedef struct xdf_tab_record { off_t seq_off; /* minimum 4 bytes, actual length in `offlen' */ off_t def_off; /* minimum 4 bytes, actual length in `offlen' */ } XdfTabRec, PNTR XdfTabRecPtr; /*== Sequence file Header and Record structures ===============*/ #define XDF_FILEFMT_SEQ 1 typedef struct xdf_seq_header { XdfStdHdr stdhdr; Uint4 flags; Uint4 _std_bpl; /* bits per letter in standard alphabet */ Uint4 _edit_bpl; /* bits per letter in edit alphabet */ } XdfSeqHdr, PNTR XdfSeqHdrPtr; typedef struct xdf_edit { ulong2b start; /* starting offset of the edit */ /* CAUTION: repcnt, editch ARE ENCODED TOGETHER */ ulong2b repcnt; /* no. of repetitions MINUS 1 of the edit character */ BLAST_Letter editch; /* edit character */ } XdfEdit, PNTR XdfEditPtr; typedef struct xdf_seq_record { ulong2b attrib; /* sequence attribute bit flags (see below) */ ulong2b seqlen; /* sequence length (unpacked and w/o sentinels) */ unsigned char PNTR seq; /* the sequence, possibly with sentinel octets */ /* `edit_len' and `editp' elements are conditionally present--only if the XDF_SEQREC_EDIT flag is set in the `attrib' element do they appear. */ ulong2b edit_cnt; /* no. of used elements in editp */ XdfEditPtr editp; /* lists are stored in increasing start order */ Uint4 _edit_alloc; /* allocated no. of elements in editp */ } XdfSeqRec, PNTR XdfSeqRecPtr; /**********************************************************************/ /*-- attribute bit flags for XdfSeqRec records (`attrib' element) --*/ /* ISRNA is set if T's should be displayed as U's */ #define XDF_SEQREC_ISRNA 0x01 /* EDIT is set if one or more edits are present */ #define XDF_SEQREC_EDIT 0x02 #define XDF_SEQREC_RESERVED3 0x04 #define XDF_SEQREC_RESERVED4 0x08 #define XDF_SEQREC_RESERVED5 0x10 #define XDF_SEQREC_RESERVED6 0x20 /* more elements here, if size of `attrib' is expanded beyond 1 byte */ /**********************************************************************/ /*== Definitions file Header and Record structures ============*/ #define XDF_FILEFMT_DEF 1 typedef struct xdf_def_header { XdfStdHdr stdhdr; Uint4 flags; } XdfDefHdr, PNTR XdfDefHdrPtr; typedef struct xdf_def_record { ulong2b deflen; char *defp; } XdfDefRec, PNTR XdfDefRecPtr; /*== Index file Header structures ============*/ #define XDF_FILEFMT_IDX 1 typedef struct xdf_idx_header { XdfStdHdr stdhdr; Uint8 nindexed; /* indexed IDs */ Uint8 dupids; /* duplicate IDs */ Uint8 redids; /* redundant IDs */ Uint8 noids; /* no indexed IDs */ unsigned accverflag; /* how to treat versions in ACC.VER */ unsigned ntags; dbtag_node_ptr tagnodes[DBTAG_CNT]; /* tags found in tagflag[], ntags long */ unsigned char _tagmask[DBTAG_CNT]; /* bit set => do index tag+token combo */ unsigned char PNTR tagmask; unsigned char _tagflag[DBTAG_CNT]; unsigned char PNTR tagflag; /* bit set => tag+token combo has been seen/indexed */ } XdfIdxHdr, PNTR XdfIdxHdrPtr; #endif /* !__XDF_TYPES__ */