This page is also available in: 日本語

This page provides some examples of SQL queries that can be executed from the PDBj Mine interface at /#!mine. You can use these samples at SQL Search of PDBj Mine or PDBj Mine:REST API. The result are returned in web pages or a test file such as XML, CSV, and TSV, respectively.

[Search for all PDB IDs:]

SELECT pdbid FROM brief_summary

[Search for all count (PDB IDs):]

SELECT count(pdbid) FROM brief_summary

[Search for all PDB IDs (1g*):]

SELECT pdbid FROM brief_summary where pdbid like '1g%'

[Search for all summary (1g*):]

SELECT pdbid, summary FROM brief_summary b where pdbid like '1g%'

[Search for PDB ID (1gof):]

SELECT doc FROM xmldoc where pdbid='1gof'

[Search for PDBID that contains keywords]alcohol dehydrogenase

SELECT pdbid FROM brief_summary where plainto_tsquery('alcohol dehydrogenase') @@ tsv

[Search for citation_author category (1gof):]

SELECT * FROM xp_category('1gof','citation_author')

[Search for citation_author ('Ito, N.'):]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{"Ito, N."}' <@ citation_author

[Search for citation_author ('Ito, N.' and 'Phillips, S.E.V.'):]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{"Ito, N.","Phillips, S.E.V."}' <@ citation_author

[Search for citation_author ['Ito, N.'] (primary):]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{"Ito, N."}' <@ citation_author_pri

[Search for citation title ['alcohol dehydrogenase']:]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE plainto_tsquery('english','alcohol dehydrogenase') @@ citation_title

[Search for journal name ['molecular cell']:]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE plainto_tsquery('english','molecular cell') @@ citation_journal

[Search for journal year ['2001']:]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{2001}' && citation_year

[Search for journal volume ['555']:]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{"555"}' && citation_volume

[Search for exprimental method [SOLUTION NMR]:]

SELECT pdbid, summary
FROM brief_summary b
WHERE 6 = ANY(exptl_method)

Note on experimental methods

In the brief_summary table, experimental method (exptl_method column) is defined as an enumeration of keywords defined as follows:

  1. X-RAY DIFFRACTION
  2. NEUTRON DIFFRACTION
  3. FIBER DIFFRACTION
  4. ELECTRON CRYSTALLOGRAPHY
  5. ELECTRON MICROSCOPY
  6. SOLUTION NMR
  7. SOLID-STATE NMR
  8. SOLUTION SCATTERING
  9. POWDER DIFFRACTION
  10. INFRARED SPECTROSCOPY
  11. EPR
  12. THEORETICAL MODEL
  13. Others

[Search for journal year[1991] volume[266] exp_method [X-RAY DIFFRACTION]:]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE '{1991}' && citation_year AND '{"266"}' && citation_volume 
AND '{1}' && exptl_method

[Search for exptl_crystal/density_percent_sol (1iqq):]

SELECT p.val AS density_percent_sol
FROM brief_summary s 
JOIN "//exptl_crystal/density_percent_sol" p 
    ON s.docid = p.docid WHERE s.pdbid = '1iqq'

[Search for citation/pdbx_database_id_DOI (1iqq):]

SELECT p.val AS "pdbx_database_id_DOI"
FROM brief_summary s 
JOIN "//citation/pdbx_database_id_DOI" p 
    ON s.docid = p.docid WHERE s.pdbid = '1iqq'

[Search the PubMed ID of 1iqq (citation/pdbx_database_id_PubMed):]

SELECT p.val AS "pdbx_database_id_PubMed"
FROM brief_summary s 
JOIN "//citation/pdbx_database_id_PubMed" p 
    ON s.docid = p.docid WHERE s.pdbid = '1iqq'

[Result]

 pdbx_database_id_PubMed: 
11577107

[Search the PubMed ID of 1ao8 and 1ai9 (citation/pdbx_database_id_PubMed):]

SELECT s.pdbid, p.val AS "pdbx_database_id_PubMed"
FROM brief_summary s 
JOIN "//citation/pdbx_database_id_PubMed" p 
    ON s.docid = p.docid WHERE s.pdbid = '1ai9' or s.pdbid='1ao8'

[Result]

 1ao8 
 pdbx_database_id_PubMed: 
9514736 1ai9 
 pdbx_database_id_PubMed: 
9374515

[Search for citation/pdbx_database_id_DOI (1i*):]

SELECT pdbid, p.val AS "pdbx_database_id_DOI"
FROM brief_summary s 
JOIN "//citation/pdbx_database_id_DOI" p 
    ON s.docid = p.docid WHERE s.pdbid like '1i%'

[Search for summary release date [after 2000-01-01]:]

SELECT pdbid, summary 
FROM brief_summary 
WHERE release_date >= '2000-01-01'

[Search for summary release date [after 2003-01-01] and deposition date [after 2001-01-01] and PDB ID (2k*) :]

SELECT pdbid, summary 
FROM brief_summary b 
WHERE release_date >= '2003-01-01' 
AND deposition_date >= '2001-01-01' 
AND b.pdbid LIKE '2k%'

[Search for summary chain type [polypeptide(D)]:]

SELECT pdbid, summary FROM brief_summary WHERE '{1}' <@ chain_type

Note on chain types

The chain types of polymers in the brief_summary table are defined as an enumeration of specific keywords. The following chain types are currently available:

  1. polypeptide(D)
  2. polypeptide(L)
  3. polydeoxyribonucleotide
  4. polyribonucleotide
  5. polysaccharide(D)
  6. polysaccharide(L)
  7. polydeoxyribonucleotide/polyribonucleotide hybrid
  8. cyclic-pseudo-peptide
  9. other

[Search for summary chain type [polypeptide(D),polypeptide(L)]:]

SELECT pdbid, summary FROM brief_summary WHERE '{1,2}' <@ chain_type

[Search for summary chain type [no polydeoxyribonucleotide]:]

SELECT pdbid, summary FROM brief_summary WHERE NOT ('{3}' && chain_type)

[Search for summary chain type [no polydeoxyribonucleotide, polyribonucleotide]:]

SELECT pdbid, summary 
FROM brief_summary 
WHERE NOT ('{3,4}' && chain_type)

[Search for summary chain type [all no except polysaccharide(D)]:]

SELECT pdbid, summary 
FROM brief_summary 
WHERE ('{5}' <@ chain_type) AND NOT ('{1,2,3,4,6,7,8,9}' && chain_type)

[Search for exptl/method, unit axes, unit angles, exptl_crystal/density_percent_sol, exptl_crystal/density_percent_sol (1iqq, 1ioo):]

SELECT s.pdbid, p1.val AS "datablockName", p2.val AS "method"
          , l1.val AS "length_a", l2.val AS "length_b", l3.val AS "length_c"
          , p3.val AS "density_percent_sol"
    , (SELECT * FROM xp_category(s.pdbid,'exptl_crystal_grow_comp')) 
FROM brief_summary s 
JOIN "//@datablockName" p1 ON s.docid = p1.docid 
JOIN "//exptl/@method" p2 ON s.docid = p2.docid 
JOIN "E://cell" e1 ON s.docid = e1.docid 
JOIN "//cell/length_a" l1 
        ON l1.docid = e1.docid AND l1.pos BETWEEN e1.pstart AND e1.pend 
JOIN "//cell/length_b" l2 
        ON l2.docid = e1.docid AND l2.pos BETWEEN e1.pstart AND e1.pend 
JOIN "//cell/length_c" l3 
        ON l3.docid = e1.docid AND l3.pos BETWEEN e1.pstart AND e1.pend 
JOIN "//exptl_crystal/density_percent_sol" p3 ON p3.docid = s.docid
WHERE (s.pdbid = '1iqq' OR s.pdbid = '1ioo')

[Search for entity names with EC number 1.1.1.1:]

SELECT s.pdbid, p2.val AS "pdbx_description"
FROM brief_summary s
JOIN "E://entity" e ON s.docid = e.docid
JOIN "//entity/pdbx_ec" p1 
        ON s.docid = p1.docid AND p1.pos BETWEEN e.pstart AND e.pend
JOIN "//entity/pdbx_description" p2 
        ON s.docid = p2.docid AND p2.pos BETWEEN e.pstart AND e.pend
WHERE p1.val = '1.1.1.1'

[Search for all entries with resolution less than or equal to 2.0 angstroms:]

SELECT s.pdbid, p.val AS "resolution"
FROM brief_summary s
JOIN "//refine/ls_d_res_high" p ON s.docid = p.docid
WHERE p.val <= 2.0

[List PDB ID and total polymer molecular weights:]

SELECT s.pdbid, SUM(p2.val * p3.val) AS weight
FROM brief_summary s
JOIN "E://entity" e ON e.docid = s.docid
JOIN "//entity/type" p1
        ON p1.docid = e.docid AND p1.pos BETWEEN e.pstart AND e.pend
JOIN "//entity/pdbx_number_of_molecules" p2
        ON p2.docid = e.docid AND p2.pos BETWEEN e.pstart AND e.pend
JOIN "//entity/formula_weight" p3
        ON p3.docid = e.docid AND p3.pos BETWEEN e.pstart AND e.pend
WHERE p1.val = 'polymer'
GROUP BY s.pdbid
ORDER BY weight DESC

[Search for all entries with pdbx_descriptor, pdbx_keywords, unit axes, unit angles, number of polymer molecules, and polymer weight:]

SELECT s.pdbid, p1.val AS pdbx_descriptor, p2.val AS pdbx_keywords
           , p3.val AS length_a, p4.val AS length_b, p5.val AS length_c
           , p6.val AS angle_alpha, p7.val AS angle_beta, p8.val AS angle_gamma
           , SUM(p22.val) AS number_of_molecules
           , SUM(p22.val * p23.val) AS weight
FROM brief_summary s
JOIN "//struct/pdbx_descriptor" p1 ON p1.docid = s.docid
JOIN "//struct_keywords/pdbx_keywords" p2 ON p2.docid = s.docid
JOIN "E://cell" e1 ON e1.docid = s.docid
JOIN "//cell/length_a" p3 
        ON p3.docid = e1.docid AND p3.pos BETWEEN e1.pstart AND e1.pend
JOIN "//cell/length_b" p4
        ON p4.docid = e1.docid AND p4.pos BETWEEN e1.pstart AND e1.pend
JOIN "//cell/length_c" p5
        ON p5.docid = e1.docid AND p5.pos BETWEEN e1.pstart AND e1.pend
JOIN "//cell/angle_alpha" p6
        ON p6.docid = e1.docid AND p6.pos BETWEEN e1.pstart AND e1.pend
JOIN "//cell/angle_beta" p7
        ON p7.docid = e1.docid AND p7.pos BETWEEN e1.pstart AND e1.pend
JOIN "//cell/angle_gamma" p8
        ON p8.docid = e1.docid AND p8.pos BETWEEN e1.pstart AND e1.pend
JOIN "E://entity" e2 ON e2.docid = s.docid
JOIN "//entity/type" p21
        ON p21.docid = e2.docid AND p21.pos BETWEEN e2.pstart AND e2.pend
JOIN "//entity/pdbx_number_of_molecules" p22 
        ON p22.docid = e2.docid AND p22.pos BETWEEN e2.pstart AND e2.pend
JOIN "//entity/formula_weight" p23
        ON p23.docid = e2.docid AND p23.pos BETWEEN e2.pstart AND e2.pend
WHERE p21.val = 'polymer'
GROUP BY s.pdbid, p1.val, p2.val, p3.val, p4.val, p5.val, p6.val, p7.val, p8.val

NOTE: The mmCIF category views

As can be seen in the above example, writing queries that involve many mmCIF category can be tedious. For convenience, views ( mmCIF category views ) that group mmCIF items in each category into one row are defined. Using these views, a query that is semantically equivalent to the one above can be written as

SELECT s.pdbid, c1.pdbx_descriptor, c2.pdbx_keywords
, c3.length_a , c3.length_b , c3.length_c
, c3.angle_alpha , c3.angle_beta , c3.angle_gamma
, SUM(c4.pdbx_number_of_molecules) AS number_of_molecules
, SUM(c4.pdbx_number_of_molecules * c4.formula_weight) AS weight
FROM brief_summary s
JOIN struct c1 ON c1.docid = s.docid
JOIN struct_keywords c2 ON c2.docid = s.docid
JOIN cell c3 ON c3.docid = s.docid
JOIN entity c4 ON c4.docid = s.docid
WHERE c4.type = 'polymer'
GROUP BY s.pdbid, c1.pdbx_descriptor, c2.pdbx_keywords
, c3.length_a , c3.length_b , c3.length_c
, c3.angle_alpha , c3.angle_beta , c3.angle_gamma
ORDER BY s.pdbid

Here, struct, struct_keywords, cell, entity are the mmCIF category views. The view names and item (column) names are defined as in the mmCIF dictionary as much as possible. For some item names, you may need to enclose them by double quotes because they involve characters that are not allowed as SQL (literal) identifiers.

There are a few caveats to note:

  1. Querying with the mmCIF category views can be slow sometimes, especially if a category contains many items, and hence the view involves many (left outer) joins.
  2. Since many of the mmCIF items lack actual data, the value of those items are set to NULL in the mmCIF category views.

For your information, the following is the definition of the mmCIF category view for the entity category:

CREATE VIEW entity (
 docid
  , "id"  , "details"  , "formula_weight"  , "pdbx_description"  , "pdbx_ec"
  , "pdbx_formula_weight_exptl"  , "pdbx_formula_weight_exptl_method"  , "pdbx_fragment"
  , "pdbx_modification"  , "pdbx_mutation"  , "pdbx_number_of_molecules"
  , "pdbx_parent_entity_id"  , "pdbx_target_id"  , "src_method"  , "type"
) AS SELECT e.docid, 
p0.val
  , p1.val  , p2.val  , p3.val  , p4.val  , p5.val
  , p6.val  , p7.val  , p8.val  , p9.val  , p10.val
  , p11.val  , p12.val  , p13.val  , p14.val 
FROM "E:/datablock/entityCategory/entity" e
LEFT OUTER JOIN "/datablock/entityCategory/entity/@id" p0 
    ON p0.docid = e.docid AND p0.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/details" p1 
    ON p1.docid = e.docid AND p1.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/formula_weight" p2 
    ON p2.docid = e.docid AND p2.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_description" p3 
    ON p3.docid = e.docid AND p3.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_ec" p4 
    ON p4.docid = e.docid AND p4.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_formula_weight_exptl" p5 
    ON p5.docid = e.docid AND p5.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_formula_weight_exptl_method" p6 
    ON p6.docid = e.docid AND p6.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_fragment" p7 
    ON p7.docid = e.docid AND p7.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_modification" p8 
    ON p8.docid = e.docid AND p8.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_mutation" p9 
    ON p9.docid = e.docid AND p9.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_number_of_molecules" p10 
    ON p10.docid = e.docid AND p10.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_parent_entity_id" p11 
    ON p11.docid = e.docid AND p11.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/pdbx_target_id" p12 
    ON p12.docid = e.docid AND p12.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/src_method" p13 
    ON p13.docid = e.docid AND p13.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "/datablock/entityCategory/entity/type" p14 
    ON p14.docid = e.docid AND p14.pos BETWEEN e.pstart AND e.pend

[Retrieve biological unit information]

SELECT s.pdbid, p1.details, p1.oligomeric_count, p1.oligomeric_details
    , p2.assembly_id, p2.asym_id_list, p2.oper_expression
FROM brief_summary s
JOIN pdbx_struct_assembly p1 ON p1.docid = s.docid
JOIN pdbx_struct_assembly_gen p2 ON p2.docid = s.docid AND p1.id = p2.assembly_id
WHERE p1.details IS NOT NULL

NOTE

Here we use views for mmCIF categories pdbx_struct_assembly and pdbx_struct_assembly_gen which groups mmCIF items in each category element into one row.


[Retrieve biological unit information for PDB entry 1nov]

SELECT s.pdbid, p1.details, p1.oligomeric_count, p1.oligomeric_details
    , p2.assembly_id, p2.asym_id_list, p2.oper_expression
FROM brief_summary s
JOIN pdbx_struct_assembly p1 ON p1.docid = s.docid
JOIN pdbx_struct_assembly_gen p2 ON p2.docid = s.docid AND p1.id = p2.assembly_id
WHERE s.pdbid = '1nov'

[Result]

 1nov 
 details: 
complete icosahedral assembly oligomeric_count: 
360 oligomeric_details: 
360-MERIC assembly_id: 
1 asym_id_list: 
A,B,C,D,E,F oper_expression: 
(1-60) 1nov 
 details: 
icosahedral asymmetric unit assembly_id: 
2 asym_id_list: 
A,B,C,D,E,F oper_expression: 
1 1nov 
 details: 
icosahedral pentamer assembly_id: 
3 asym_id_list: 
A,B,C,D,E,F oper_expression: 
(1-5) 1nov 
 details: 
icosahedral 23 hexamer assembly_id: 
4 asym_id_list: 
A,B,C,D,E,F oper_expression: 
(1,2,6,10,23,24) 1nov 
 details: 
icosahedral asymmetric unit, std point frame assembly_id: 
PAU asym_id_list: 
A,B,C,D,E,F oper_expression: 
P

[Counting the total number of residues in entries]

WITH slen(docid, entity_id, len) AS
(SELECT docid, p.val, COUNT(*)
 FROM "//entity_poly_seq/@entity_id" p
 GROUP BY docid,p.val)
SELECT b.pdbid, SUM(e.pdbx_number_of_molecules * s.len)
FROM brief_summary b
JOIN entity e ON e.docid = b.docid
JOIN slen s ON s.docid = e.docid AND s.entity_id = e.id
GROUP BY b.pdbid

[Map between PDB entity and UniProt accession]

Using category view

SELECT DISTINCT s.pdbid, e.entity_id, e.db_code, e.pdbx_db_accession
, e.pdbx_db_primary_accession
FROM brief_summary s  
JOIN struct_ref e ON s.docid = e.docid 
WHERE db_name = 'UNP' AND entity_id IS NOT NULL

Using XPath-based tables

SELECT DISTINCT s.pdbid, p1.val AS entity_id, p3.val AS db_code, p4.val AS pdbx_db_accession
FROM brief_summary s  
JOIN "E://struct_ref" e ON e.docid = s.docid
JOIN "//struct_ref/entity_id" p1 
    ON p1.docid = e.docid AND p1.pos BETWEEN e.pstart AND e.pend
JOIN "//struct_ref/db_name" p2
    ON p2.docid = e.docid AND p2.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "//struct_ref/db_code" p3
    ON p3.docid = e.docid AND p3.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "//struct_ref/pdbx_db_accession" p4
    ON p4.docid = e.docid AND p4.pos BETWEEN e.pstart AND e.pend
LEFT OUTER JOIN "//struct_ref/pdbx_db_primary_accession" p5
    ON p5.docid = e.docid AND p5.pos BETWEEN e.pstart AND e.pend
WHERE p2.val = 'UNP'

[Some XPath expressions (for demonstration only)]

Simple example

Retrieving the entity element with id = 1 for PDBID 1gof:

SELECT xpath('/PDBx:datablock/PDBx:entityCategory/PDBx:entity[@id="1"]'
  ,  doc
  ,  ARRAY[ARRAY['PDBx', 'http://pdbj.protein.osaka-u.ac.jp/XML/pdbmlplus/pdbMLplus_v40.xsd' ]])
FROM xmldoc
WHERE pdbid = '1gof'

A little complex example (Very slow!)

This query as well as other queries using the XPath function are very slow in general. Please avoid using it.

Retrieving all PDB ID's whose resolution is less than 2.0 Angstrom:

SELECT xpath(
'/PDBx:datablock[PDBx:refineCategory/PDBx:refine/PDBx:ls_d_res_high<=2.0]/@datablockName'
  , doc
  , ARRAY[ARRAY['PDBx', 'http://pdbj.protein.osaka-u.ac.jp/XML/pdbmlplus/pdbMLplus_v40.xsd' ]])
FROM xmldoc
(last edited: 1 month ago)2013-11-29
Loading...
Loading