% This file was created with JabRef 2.4.2. % Encoding: UTF8 @ARTICLE{Bailey1982, author = {Bailey, T. A. and Dubes, R.}, title = {Cluster validity profiles}, journal = {Pattern Recognition}, year = {1982}, volume = {15}, pages = {61-83}, number = {2}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{VanMechelen2004, author = {Van Mechelen, I. and Bock, H. H. and De Boeck, P.}, title = {Two-mode clustering methods: a structured overview}, journal = {Statistical Methods in Medical Research}, year = {2004}, volume = {13}, pages = {363-394}, number = {5}, abstract = {In this paper we present a structured overview of methods for two-mode clustering, that is, methods that provide a simultaneous clustering of the rows and columns of a rectangular data matrix. Key structuring principles include the nature of row, column and data clusters and the type of model structure or associated loss function. We illustrate with analyses of symptom data on archetypal psychiatric patients.}, keywords = {ERROR-VARIANCE APPROACH; HIERARCHICAL CLASSES; PROXIMITY DATA; NODAL ANALYSIS; ALGORITHM; MODELS; BLOCKMODELS; STRATEGIES}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Tonidandel2004, author = {Tonidandel, S. and Overall, J. E.}, title = {Determining the number of clusters by sampling with replacement}, journal = {Psychological Methods}, year = {2004}, volume = {9}, pages = {238-249}, number = {2}, abstract = {A split-sample replication criterion originally proposed by J. E. Overall and K. N. Magee (1992) as a stopping rule for hierarchical cluster analysis is applied to multiple data sets generated by sampling with replacement from an original simulated primary data set. An investigation of the validity of this bootstrap procedure was undertaken using different combinations of the true number of latent populations, degrees of overlap, and sample sizes. 
The bootstrap procedure enhanced the accuracy of identifying the true number of latent populations under virtually all conditions. Increasing the size of the resampled data sets relative to the size of the primary data set further increased accuracy. A computer program to implement the bootstrap stopping rule is made available via a referenced Web site.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Todem2007, author = {Todem, D. and Kim, K.}, title = {Existence and uniqueness conditions for the maximum likelihood solution in regression models for correlated Bernoulli data.}, journal = {Journal of Mathematics and Statistics}, year = {2007}, volume = {3}, pages = {134-141}, number = {3}, abstract = {We give sufficient and necessary conditions for the existence of the maximum liklihood estimate in a class of multivariate regression models for correlated Bernoulli random variables. The models use the concept of threshold crossing technique of an underlying multivariate latent variable with univariate components formulated as a linear regression model. However, in place of their Gaussian assumptions, any specified distribution with a strictly increasing cumulative distribution function is allowed for error terms. A well known member of this class of models is the multivariate probit model. We show that our results are a generalization of the concepts of separation and overlap of Albert and Anderson for the study of the existence of maximum likelihood estimate in generalized linear models. Implications of our findings are illustrated through some hypothetical examples.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @INPROCEEDINGS{Tikka2007, author = {Tikka, J. and Hollmen, J. and Myllykangas, S.}, title = {Mixture modeling of DNA copy number amplification patterns in cancer}, booktitle = {9th International Work-Conference on Artificial Neural Networks}, year = {2007}, editor = {Sandoval, F. Prieto A. Cabestany J. 
Grana M.}, pages = {972-979}, address = {San Sebastian, SPAIN}, abstract = {DNA copy number amplifications are hallmarks of many cancers. In this work we analyzed data of genome-wide DNA copy number amplifications collected from more than 4500 neoplasm cases. Based on the 0-1 representation of the data, we trained finite mixtures of multivariate Bernoulli distributions using the EM algorithm to describe the inherent structure in the data. The resulting component distributions of the mixtures of Bernoulli distributions yielded plausible and localized amplification patterns. Individual amplification patterns were tested for their role in cancer groups formed with known risk associations. Our detailed analysis of chromosome I showed that asbestos-exposure related and hormonal imbalance-associated cancers were clustered and specific chromosome bands, 1p34 and 1q42, were identified. These sites contain cancer genes, which might explain the condition-specific selection of these loci for amplification.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Tibshirani2001, author = {Tibshirani, R. and Walther, G. and Hastie, T.}, title = {Estimating the number of clusters in a data set via the gap statistic}, journal = {Journal of the Royal Statistical Society Series B-Statistical Methodology}, year = {2001}, volume = {63}, pages = {411-423}, abstract = {We propose a method (the 'gap statistic') for estimating the number of clusters (groups) in a set of data. The technique uses the output of any clustering algorithm (e.g. K-means or hierarchical), comparing the change in within-cluster dispersion with that expected under an appropriate reference null distribution. Some theory is developed for the proposal and a simulation study shows that the gap statistic usually outperforms other methods that have been proposed in the literature.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Symons1981, author = {Symons, M. 
J.}, title = {Clustering criteria and multivariate normal mixtures}, journal = {Biometrics}, year = {1981}, volume = {37}, pages = {35-43}, number = {1}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Su2001, author = {Su, M. S. and Chou, C. H.}, title = {A modified version of the K-means algorithm with a distance based on cluster symmetry}, journal = {Ieee Transactions on Pattern Analysis and Machine Intelligence}, year = {2001}, volume = {23}, pages = {674-680}, number = {6}, abstract = {In this paper, we propose a modified version of the K-means algorithm to cluster data. The proposed algorithm adopts a novel nonmetric distance measure based on the idea of "point symmetry." This kind of "point symmetry distance" can be applied in data clustering and human face detection. Several data sets are used to illustrate its effectiveness.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Brusco2007a, author = {Brusco, M. J. and Steinley, D.}, title = {A comparison of heuristic procedures for minimum within-cluster sums of squares partitioning}, journal = {Psychometrika}, year = {2007}, volume = {72}, pages = {583-600}, number = {4}, abstract = {Perhaps the most common criterion for partitioning a data set is the minimization of the within-cluster sums of squared deviation from cluster centroids. Although optimal solution procedures for within-cluster sums of squares (WCSS) partitioning are computationally feasible for small data sets, heuristic procedures are required for most practical applications in the behavioral sciences. We compared the performances of nine prominent heuristic procedures for WCSS partitioning across 324 simulated data sets representative of a broad spectrum of test conditions. Performance comparisons focused on both percentage deviation from the "best-found" WCSS values, as well as recovery of true cluster structure. 
A real-coded genetic algorithm and variable neighborhood search heuristic were the most effective methods; however, a straightforward two-stage heuristic algorithm, HK-means, also yielded exceptional performance. A follow-up experiment using 13 empirical data sets from the clustering literature generally supported the results of the experiment using simulated data. Our findings have important implications for behavioral science researchers, whose theoretical conclusions could be adversely affected by poor algorithmic performances.}, keywords = {combinatorial data analysis; cluster analysis; heuristics; sum of squares criterion K-MEANS ALGORITHM; GENETIC ALGORITHM; BINARY DATA; VALIDATION; VARIABLES; SELECTION; CRITERIA; BRANCH; SETS}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Strauss1982, author = {Strauss, R. E.}, title = {Statistical significance of species clusters in association analysis}, journal = {Ecology}, year = {1982}, volume = {63}, pages = {634-639}, number = {3}, note = {Times Cited: 48}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Brusco2007, author = {Brusco, M. and Steinley, D.}, title = {A variable neighborhood search method for generalized blockmodeling of two-mode binary matrices}, journal = {Journal of Mathematical Psychology}, year = {2007}, volume = {51}, pages = {325-338}, number = {5}, abstract = {The clustering of two-mode proximity matrices is a challenging combinatorial optimization problem that has important applications in the quantitative social sciences. We focus on one particular type of problem related to the clustering of a two-mode binary matrix, which is relevant to the establishment of generalized blockmodels for social networks. In this context, clusters for the rows of the two-mode matrix intersect with clusters of the columns to form blocks, which should ideally be either complete (all Is) or null (all Os). 
A new procedure based on variable neighborhood search is presented and compared to an existing two-mode K-means clustering algorithm. The new procedure. generally provided slightly greater explained variation; however, both methods yielded exceptional recovery of cluster structure. (C) 2007 Elsevier Inc. All rights reserved.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2005, author = {Steinley, D. and Henson, R.}, title = {OCLUS: An analytic method for generating clusters with known overlap}, journal = {Journal of Classification}, year = {2005}, volume = {22}, pages = {221-250}, number = {2}, abstract = {The primary method for validating cluster analysis techniques is through Monte Carlo simulations that rely on generating data with known cluster structure (e.g., Milligan 1996). This paper defines two kinds of data generation mechanisms with cluster overlap, marginal and joint; current cluster generation methods are framed within these definitions. An algorithm generating overlapping clusters based on shared densities from several different multivariate distributions is proposed and shown to lead to an easily understandable notion of cluster overlap. Besides outlining the advantages of generating clusters within this framework, a discussion is given of how the proposed data generation technique can be used to augment research into current classification techniques such as finite mixture modeling, classification algorithm robustness, and latent profile analysis.}, keywords = {cluster generation; overlapping clusters ARTIFICIAL TEST CLUSTERS; MIXTURE MODEL TESTS; CLASSIFICATION CAPABILITIES; STOPPING RULES; INFORMATION; ALGORITHMS; DISTRIBUTIONS; VARIABLES; SIZE}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Bezdek1998, author = {Bezdek, J. C. and Pal, N. 
R.}, title = {Some new indexes of cluster validity}, journal = {Ieee Transactions on Systems Man and Cybernetics Part B-Cybernetics}, year = {1998}, volume = {28}, pages = {301-315}, number = {3}, abstract = {We review two clustering algorithms (hard c-means and single linkage) and three indexes of crisp cluster validity (Hubert's statistics, the Davies-Bouldin index, and Dunn's index). We illustrate two deficiencies of Dunn's index which make it overly sensitive to noisy clusters and propose several generalizations of it that are not as brittle to outliers in the clusters. Our numerical examples show that the standard measure of interset distance (the minimum distance between points in a pair of sets) is the worst (least reliable) measure upon which to base cluster validation indexes when the clusters are expected to form volumetric clouds. Experimental results also suggest that intercluster separation plays a more important role in cluster validation than cluster diameter. Our simulations show that while Dunn's original index has operational flaws, the concept it embodies provides a rich paradigm for validation of partitions that have cloud-like clusters. Five of our generalized Dunn's indexes provide the best validation results for the simulations presented.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2008, author = {Steinley, D. and Brusco, M. J.}, title = {A new variable weighting and selection procedure for {K}-means cluster analysis}, journal = {Multivariate Behavioral Research}, year = {2008}, volume = {43}, pages = {77-108}, number = {1}, abstract = {A variance-to-range ratio variable weighting procedure is proposed. We show how this weighting method is theoretically grounded in the inherent variability found in data exhibiting cluster structure. In addition, a variable selection procedure is proposed to operate in conjunction with the variable weighting technique. 
The performances of these procedures are demonstrated in a simulation study, showing favorable results when compared with existing standardization methods. A detailed demonstration of the weighting and selection procedure is provided for the well-known Fisher Iris data and several synthetic data sets.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Bertrand2006, author = {Bertrand, P. and Mufti, G. B.}, title = {Loevinger's measures of rule quality for assessing cluster stability}, journal = {Computational Statistics \& Data Analysis}, year = {2006}, volume = {50}, pages = {992-1015}, number = {4}, abstract = {A method is developed for measuring clustering stability under the removal of a few objects from a set of objects to be partitioned. Measures of stability of an individual cluster are defined as Loevinger's measures of rule quality. The stability of an individual cluster can be interpreted as a weighted mean of the inherent stabilities in the isolation and cohesion, respectively, of the examined cluster. The design of the method also enables us to measure the stability of a partition, that can be viewed as a weighted mean of the stability measures of all clusters in the partition. As a consequence, an approach is derived for determining the optimal number of clusters of a partition. Furthermore, using a Monte Carlo test, a significance probability is computed in order to assess how likely any stability measure is, under a null model that specifies the absence of cluster stability. In order to illustrate the potential of the method, stability measures that were obtained by using the batch K-Means algorithm on artificial data sets and on Iris Data are presented. (c) 2004 Elsevier B.V. All rights reserved.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Bayne1980, author = {Bayne, C. K. and Beauchamp, J. J. and Begovich, C. L. and Kane, V. 
E.}, title = {Monte-{C}arlo comparisons of selected clustering procedures}, journal = {Pattern Recognition}, year = {1980}, volume = {12}, pages = {51-62}, number = {2}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2008a, author = {Steinley, D. and Brusco, M. J.}, title = {Selection of variables in cluster analysis: {A}n empirical comparison of eight procedures}, journal = {Psychometrika}, year = {2008}, volume = {73}, pages = {125-144}, number = {1}, abstract = {Eight different variable selection techniques for model-based and non-model-based clustering are evaluated across a wide range of cluster structures. It is shown that several methods have difficulties when non-informative variables (i.e., random noise) are included in the model. Furthermore, the distribution of the random noise greatly impacts the performance of nearly all of the variable selection procedures. Overall, a variable selection technique based on a variance-to-range weighting procedure coupled with the largest decreases in within-cluster sums of squares error performed the best. On the other hand, variable selection methods used in conjunction with finite mixture models performed the worst.}, keywords = {cluster analysis; variable selection PROJECTION PURSUIT; LOCAL OPTIMA; ALGORITHM}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2007a, author = {Steinley, D. and Brusco, M. J.}, title = {Initializing {$K$}-means batch clustering: A critical evaluation of several techniques}, journal = {Journal of Classification}, year = {2007}, volume = {24}, pages = {99-121}, number = {1}, abstract = {K-means clustering is arguably the most popular technique for partitioning data. Unfortunately, K-means suffers from the well-known problem of locally optimal solutions. Furthermore, the final partition is dependent upon the initial configuration, making the choice of starting partitions all the more important. 
This paper evaluates 12 procedures proposed in the literature and provides recommendations for best practices.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2007, author = {Steinley, D.}, title = {Validating clusters with the lower bound for sum-of-squares error}, journal = {Psychometrika}, year = {2007}, volume = {72}, pages = {93-106}, number = {1}, abstract = {Given that a minor condition holds (e.g., the number of variables is greater than the number of clusters), a nontrivial lower bound for the sum-of-squares error criterion in K-means clustering is derived. By calculating the lower bound for several different situations, a method is developed to determine the adequacy of cluster solution based on the observed sum-of-squares error as compared to the minimum sum-of-squares error.}, keywords = {k-means; cluster analysis ARTIFICIAL TEST CLUSTERS; DATA SET; CLASSIFICATION; ALGORITHM; NUMBER}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2006, author = {Steinley, D.}, title = {Profiling local optima in {$K$}-means clustering: {D}eveloping a diagnostic techniques}, journal = {Psychological Methods}, year = {2006}, volume = {11}, pages = {178-192}, number = {2}, abstract = {Using the cluster generation procedure proposed by D. Steinley and R. Henson (2005), the author investigated the performance of K-means clustering under the following scenarios: (a) different probabilities of cluster overlap; (b) different types of cluster overlap; (c) varying samples sizes, clusters, and dimensions; (d) different multivariate distributions of clusters; and (e) various multidimensional data structures. The results are evaluated in terms of the Hubert-Arabie adjusted Rand index, and several observations concerning the performance of K-means clustering are made. Finally, the article concludes with the proposal of a diagnostic technique indicating when the partitioning given by a K-means cluster analysis can be trusted. 
By combining the information from several observable characteristics of the data (number of clusters, number of variables, sample size, etc.) with the prevalence of unique local optima in several thousand implementations of the K-means algorithm, the author provides a method capable of guiding key data-analysis decisions.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2006a, author = {Steinley, D.}, title = {{$K$}-means clustering: {A} half-century synthesis}, journal = {British Journal of Mathematical \& Statistical Psychology}, year = {2006}, volume = {59}, pages = {1-34}, abstract = {This paper synthesizes the results, methodology, and research conducted concerning the K-means clustering method over the last fifty years. The K-means method is first introduced, various formulations of the minimum variance loss function and alternative loss functions within the same class are outlined, and different methods of choosing the number of clusters and initialization, variable preprocessing, and data reduction schemes are discussed. Theoretic statistical results are provided and various extensions of K-means using different metrics or modifications of the original algorithm are given, leading to a unifying treatment of K-means and some of its extensions. Finally, several future studies are outlined that could enhance the understanding of numerous subtleties affecting the performance of the K-means method.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2004, author = {Steinley, D.}, title = {Properties of the {H}ubert-{A}rabie adjusted {R}and index}, journal = {Psychological Methods}, year = {2004}, volume = {9}, pages = {386-396}, number = {3}, abstract = {This article provides an investigation of cluster validation indices that relates 4 of the indices to the L. Hubert and P. Arabie (1985) adjusted Rand index-the cluster validation measure of choice (G. W. Milligan & M. C. Cooper, 1986). 
It is shown how these other indices can be "roughly" transformed into the same scale as the adjusted Rand index. Furthermore, in-depth explanations are given of why classification rates should not be used in cluster validation research. The article concludes by summarizing several properties of the adjusted Rand index across many conditions and provides a method for testing the significance of observed adjusted Rand indices.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2004a, author = {Steinley, D.}, title = {Standardizing variables in {$K$}-means clustering}, journal = {Classification, Clustering, and Data Mining Applications}, year = {2004}, pages = {53-60}, note = {Banks, D House, L McMorris, FR Arabie, P Gaul, W Meeting of the International-Federation-of-Classifications-Societies (IFCS) JUL 15-18, 2004 Illinois Inst Technol, Chicago, IL}, abstract = {Several standardization methods are investigated in conjunction with the K-means algorithm under various conditions. We find that traditional standardization methods (i.e., z-scores) are inferior to alternative standardization methods. Future suggestions concerning the combination of standardization and variable selection are considered.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Steinley2003, author = {Steinley, D.}, title = {Local optima in {$K$}-means clustering: {W}hat you don't know may hurt you}, journal = {Psychological Methods}, year = {2003}, volume = {8}, pages = {294-304}, number = {3}, abstract = {The popular K-means clustering method, as implemented in 3 commercial software packages (SPSS, SYSTAT, and SAS), generally provides solutions that are only locally optimal for a given set of data. 
Because none of these commercial implementations offer a reasonable mechanism to begin the K-means method at alternative starting points, separate routines were written within the MATLAB (MathWorks, 1999) environment that can be initialized randomly (these routines are provided at the end of the online version of this article in the PsycARTICLES database). Through the analysis of 2 empirical data sets and 8 10 simulated data sets, it is shown that the results provided by commercial packages are most likely locally optimal. These results suggest the need for some strategy to study the local optima problem for a specific data set or to identify methods for finding "good" starting values that might lead to the best solutions possible.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @INPROCEEDINGS{Song2006, author = {Song, S. and Li, C. P.}, title = {Improved ROCK for text clustering using asymmetric proximity}, booktitle = {32nd Conference on Current Trends in Theory and Practice of Computer Science}, year = {2006}, editor = {Wiedermann, J. Tel G. Pokorny J. Bielikova M. Stuller J.}, pages = {501-510}, address = {Merin, CZECH REPUBLIC}, abstract = {The ROCK algorithm can be applied to text clustering in large databases. The effectiveness of ROCK, however, is limited, because of the high dimensionality of textual data and traditional proximity measure of documents. In this paper, we propose an improved approach to strengthen the discriminative feature of text documents, which uses asymmetric proximity. Instead of the links count in ROCK, we propose a novel concept of link weight overlaps to measure the proximity between two clusters. The IROCK (Improved ROCK) algorithm performs clustering analysis based on the overlap information of asymmetric proximities between text objects. We carry on the clustering process in an agglomerative hierarchical way. To demonstrate the effectiveness of IROCK, we perform an experimental evaluation on real textual data. 
A comparison with ROCK and classical algorithms indicates the superiority of our approach.}, keywords = {data mining; text clustering}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Snijders1991, author = {Snijders, T. A. B.}, title = {Enumeration and simulation methods for 0-1 matrices with given marginals}, journal = {Psychometrika}, year = {1991}, volume = {56}, pages = {397-417}, number = {3}, note = {Times Cited: 38}, abstract = {Data in the form of zero-one matrices where conditioning on the marginals is relevant arise in diverse fields such as social networks and ecology; directed graphs constitute an important special case. An algorithm is given for the complete enumeration of the family of all zero-one matrices with given marginals and with a prespecified set of cells with structural zero entries. Complete enumeration is computationally feasible only for relatively small matrices. Therefore, a more useable Monte Carlo simulation method for the uniform distribution over this family is given, based on unequal probability sampling and ratio estimation. This method is applied to testing reciprocity of choices in social networks.}, keywords = {ADJACENCY MATRICES; RANDOM DIGRAPHS, NETWORKS; ECOLOGY; MONTE-CARLO METHODS; UNEQUAL PROBABILITY SAMPLING; RECIPROCITY DISTRIBUTIONS; NETWORKS; CENSUS}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Shatovska2008, author = {Shatovska}, title = {The new software package for dynamic hierarchical clustering for circles types of shapes}, year = {2008}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @INPROCEEDINGS{Seppanen2003, author = {Seppanen, J. K. and Bingham, E. and Mannila, H.}, title = {A simple algorithm for topic identification in 0-1 data}, booktitle = {7th European Conference on Principles and Practice of Knowledge Discovery in Databases}, year = {2003}, editor = {Lavrac, N. Gamberger D. Todorovski L. 
Blockeel H.}, pages = {423-434}, address = {Cavtat, Croatia}, abstract = {Topics in 0-1 datasets are sets of variables whose occurrences are positively connected together. Earlier, we described a simple generative topic model. In this paper we show that, given data produced by this model, the lift statistics of attributes can be described in matrix form. We use this result to obtain a simple algorithm for finding topics in 0-1 data. We also show that a problem related to the identification of topics is NP-hard. We give experimental results on the topic identification problem, both on generated and real data.}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Scott1999, author = {Scott, P. D. and Wilkins, E.}, title = {Evaluating data mining procedures: techniques for generating artificial data sets}, journal = {Information and Software Technology}, year = {1999}, volume = {41}, pages = {579-587}, number = {9}, abstract = {In this article, we discuss the need to evaluate the performance of data mining procedures and argue that tests done with real data sets cannot provide all the information needed for a thorough assessment of their performance characteristics. We argue that artificial data sets are therefore essential. After a discussion of the desirable characteristics of such artificial data, we describe two pseudo-random generators. The first is based on the multi-variate normal distribution and gives the investigator full control of the degree of correlation between the variables in the artificial data sets. The second is inspired by fractal techniques for synthesizing artificial landscapes and can produce data whose classification complexity can be controlled by a single parameter. We conclude with a discussion of the additional work necessary to achieve the ultimate goal of a method of matching data sets to the most appropriate data mining technique. (C) 1999 Elsevier Science B.V. 
All rights reserved.}, keywords = {data mining; artificial data sets; pseudo-random generators}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Scott1971, author = {Scott, A. J. and Symons, M. J.}, title = {Clustering methods based on likelihood ratio criteria}, journal = {Biometrics}, year = {1971}, volume = {27}, pages = {387-\&}, number = {2}, note = {Times Cited: 111}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Savicky2006, author = {Savicky}, title = {Problems with Matlab RNG}, year = {2006}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Real1996, author = {Real, R. and Vargas, J. M.}, title = {The probabilistic basis of Jaccard's index of similarity}, journal = {Systematic Biology}, year = {1996}, volume = {45}, pages = {380-385}, number = {3}, note = {Times Cited: 15}, keywords = {SPECIES CO-OCCURRENCES; NULL MODELS; INTERSPECIFIC COMPETITION; OCCURRENCE PATTERNS; BINARY DATA; BIOGEOGRAPHY; ISLANDS; COMMUNITIES}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Qiu2006, author = {Qiu, W. L. and Joe, H.}, title = {Generation of random clusters with specified degree of separation}, journal = {Journal of Classification}, year = {2006}, volume = {23}, pages = {315-334}, number = {2}, abstract = {We propose a random cluster generation algorithm that has the desired features: (1) the population degree of separation between clusters and the nearest neighboring clusters can be set to a specified value, based on a separation index; (2) no constraint is imposed on the isolation among clusters in each dimension; (3) the covariance matrices correspond to different shapes, diameters and orientations; (4) the full cluster structures generally could not be detected simply from pair-wise scatterplots of variables; (5) noisy variables and outliers can be imposed to make the cluster structures harder to be recovered. 
This algorithm is an improvement on the method used in Milligan (1985).}, keywords = {cluster generation; separation index; factorial experiment design ARTIFICIAL TEST CLUSTERS; ALGORITHM; VARIABLES; SELECTION; INDEX}, owner = {rebeccaspeckman}, timestamp = {2009.01.20} } @ARTICLE{Potvin1993, author = {Potvin, C. and Roff, D. A.}, title = {Distribution-free and robust statistical-methods - viable alternatives to parametric statistics}, journal = {Ecology}, year = {1993}, volume = {74}, pages = {1617-1628}, number = {6}, note = {Times Cited: 255}, abstract = {After making a case for the prevalence of nonnormality, this paper attempts to introduce some distribution-free and robust techniques to ecologists and to offer a critical appraisal of the potential advantages and drawbacks of these methods. The techniques presented fall into two distinct categories, methods based on ranks and ''computer-intensive'' techniques. Distribution-free rank tests have features that can be recommended. They free the practitioner from concern about the underlying distribution and are very robust to outliers. If the distribution underlying the observations is other than normal, rank tests tend to be more efficient than their parametric counterparts. The absence, in computing packages, of rank procedures for complex designs may, however, severely limit their use for ecological data. An entire body of novel distribution-free methods has been developed in parallel with the increasing capacities of today's computers to process large quantities of data. These techniques either reshuffle or resample a data set (i.e., sample with replacement) in order to perform their analyses. The former we shall refer to as ''permutation'' or ''randomization'' methods and the latter as ''bootstrap'' techniques. 
These computer-intensive methods provide new alternatives for the problem of a small and/or unbalanced data set, and they may be the solution for parameter estimation when the sampling distribution cannot be derived analytically. Caution must be exercised in the interpretation of these estimates because confidence limits may be too small.},
  keywords = {POPULATION-GROWTH RATES; NONPARAMETRIC STATISTICS; MITOCHONDRIAL-DNA; ANIMAL NUMBERS; WILLIAMS-TEST; MONTE-CARLO; MANTEL TEST; DOSE LEVELS; REGRESSION; DESIGNS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Park1996,
  author = {Park, C. G. and Park, T. and Shin, D. W.},
  title = {A simple method for generating correlated binary variates},
  journal = {American Statistician},
  year = {1996},
  volume = {50},
  pages = {306--310},
  number = {4},
  abstract = {Correlated binary data are frequently analyzed in studies of repeated measurements, reliability analysis, and others. In such studies correlations among binary variables are usually nonnegative. This article provides a simple algorithm for generating an arbitrary dimensional random vector of nonnegatively correlated binary variables. In some frequently encountered situations the algorithm reduces to explicit expressions. The correlated binary variables are generated from correlated Poisson variables. The key idea lies in the property that any Poisson random variable can be expressed as a convolution of other independent Poisson random variables. The binary variables have desired correlations by sharing common independent Poisson variables.},
  keywords = {generalized estimating equations; Poisson variables; random number generation; LONGITUDINAL DATA; MODELS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Panayirci1983,
  author = {Panayirci, E. and Dubes, R.
C.},
  title = {A test for multidimensional clustering tendency},
  journal = {Pattern Recognition},
  year = {1983},
  volume = {16},
  pages = {433--444},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Joe2006: in the abstract the conditioning bar of the partial correlations
% had been exported as a backslash; it is written as "|" here.
@ARTICLE{Joe2006,
  author = {Joe, H.},
  title = {Generating random correlation matrices based on partial correlations},
  journal = {Journal of Multivariate Analysis},
  year = {2006},
  volume = {97},
  pages = {2177--2189},
  number = {10},
  abstract = {A d-dimensional positive definite correlation matrix R = (rho(ij)) can be parametrized in terms of the correlations rho(i,i+1) for i = 1,..., d - 1, and the partial correlations rho(ij|i+1,.... j-1) for j - i >= 2. These ((d)(2)) parameters can independently take values in the interval (- 1, 1). Hence we can generate a random positive definite correlation matrix by choosing independent distributions F-ij, 1 <= i < j <= d, for these ((d)(2)) parameters. We obtain conditions on the F-ij so that the joint density of (rho(ij)) is proportional to a power of det(R) and hence independent of the order of indices defining the sequence of partial correlations. As a special case, we have a simple construction for generating R that is uniform over the space of positive definite correlation matrices. As a byproduct, we determine the volume of the set of correlation matrices in ((d)(2))-dimensional space. To prove our results, we obtain a simple remarkable identity which expresses det(R) as a function of rho(i,i+1) for i = 1,..., d - 1, and rho(ij|i+1,... j-1) for j - i >= 2. (C) 2005 Elsevier Inc. All rights reserved.},
  keywords = {beta distribution; determinant of correlation matrix},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Joe2004: editors are separated with "and" so that BibTeX parses three
% names; address holds the conference location as exported.
@INPROCEEDINGS{Joe2004,
  author = {Joe, H.},
  title = {Range of correlation matrices for dependent random variables with given marginal distributions},
  booktitle = {International Conference on Distribution Theory, Order Statistics, and Inference},
  year = {2004},
  editor = {Balakrishnan, N. and Castillo, E. and Sarabia, J. M.},
  pages = {125--142},
  address = {Santander, Spain},
  abstract = {Let X-1, ..., X-d be d (d >= 3) dependent random variables with finite variances such that X-j similar to F-j. Results on the set S-d(F-1, ..., F-d) of possible correlation matrices with given margins are obtained; this set is relevant for simulating dependent random variables with given marginal distributions and a given correlation matrix. When F-1 = (...) = F-d = F, we let S-d(F) denote the set of possible correlation matrices. Of interest is the set of F for which S-d(F) is the same as the set of all non-negative definite correlation matrices; using a construction with conditional distributions, we show that this property holds only if F is a (location-scale shift of a) margin of a (d-1)-dimensional spherical distribution.},
  keywords = {spherically symmetric; elliptically contoured; copula; partial correlation; Frechet bounds},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Jaksic1990: the citation count is kept in the non-printing field
% timescited so that styles do not print it via note.
@ARTICLE{Jaksic1990,
  author = {Jaksic, F. M. and Medel, R. G.},
  title = {Objective recognition of guilds - testing for statistically significant species clusters},
  journal = {Oecologia},
  year = {1990},
  volume = {82},
  pages = {87--92},
  number = {1},
  timescited = {52},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Jain1987,
  author = {Jain, A. K. and Moreau, J. V.},
  title = {Bootstrap technique in cluster-analysis},
  journal = {Pattern Recognition},
  year = {1987},
  volume = {20},
  pages = {547--568},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Jackson1992,
  author = {Jackson, D. A. and Somers, K. M. and Harvey, H.
H.},
  title = {Null models and fish communities - evidence of nonrandom patterns},
  journal = {American Naturalist},
  year = {1992},
  volume = {139},
  pages = {930--951},
  number = {5},
  timescited = {62},
  abstract = {Studies of ecological communities often make implicit assumptions that the species have nonrandom patterns organized through biotic and abiotic factors. Although such assumptions are generally not tested, the analyses and conclusions derived depend on nonrandom patterns' being present. Several "null" or "neutral" models have been proposed to test for these patterns. We contrast two of the more prevalent models and develop two new models, subsequently evaluating them with five sets of fish community data. Three of the null models provide similar results, from which it is concluded that fish communities from five regions of Ontario are nonrandomly structured. These three models evaluate pairs of species according to departures from null or random co-occurrence expectations. A Monte Carlo model based on the procedure proposed by E. F. Connor and D. Simberloff supports random community organization, but we attribute this discrepancy to the conservative pooling of species-pair information in that model. We recommend a hybrid model combining Monte Carlo and log-linear methods for future studies, although the log-linear model of M. E. Gilpin and J. M. Diamond provides a reasonable approximation. On the basis of species associations derived from the various models, we attribute much of the nonrandom structure to common habitat requirements among co-occurring species. A predominance of positively associated species generally involves pairs of species with similar ecological characteristics. Strong negative associations typically involve predator-prey species.
Although competition is often identified as a significant feature in community ecology, we do not believe that competition is a major force structuring these fish communities.},
  keywords = {SPECIES CO-OCCURRENCES; INTERSPECIFIC COMPETITION; LAKES; ASSEMBLAGES; SIMILARITY; ISLANDS; STOCHASTICITY; ASSOCIATION; DIVERSITY; ECOLOGY},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Jackson1989: the citation count is kept in the non-printing field
% timescited so that styles do not print it via note.
@ARTICLE{Jackson1989,
  author = {Jackson, D. A. and Somers, K. M. and Harvey, H. H.},
  title = {Similarity coefficients - measures of co-occurrence and association or simply measures of occurrence},
  journal = {American Naturalist},
  year = {1989},
  volume = {133},
  pages = {436--453},
  number = {3},
  timescited = {88},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Irigoien2008,
  author = {Irigoien, I. and Arenas, C.},
  title = {{INCA}: New statistic for estimating the number of clusters and identifying atypical units},
  journal = {Statistics in Medicine},
  year = {2008},
  volume = {27},
  pages = {2948--2973},
  number = {15},
  abstract = {This paper presents a solution to two problems that arise in the classification of data such as types of tumor, samples of gene expression profiles or general biomedical data. First, to estimate the real number of clusters in a data set and second to decide whether a new unit belongs to one of these previously identified clusters or it is an outlier or atypical unit. We propose a new statistic which allows us to solve these problems. As our approach is based on a measure of distance or dissimilarity between any pair of units, it can be applied to any kind of multivariate data (continuous, binary or multi-attribute data) and it has applications in many biomedical fields. We validated the approach in simulated examples and applied it to the diagnosis of dermal diseases and to the analysis of lymphatic cancer data, showing the good performance of our approach.
Copyright (C) 2007 John Wiley \& Sons, Ltd.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Hunter2004: the citation count is kept in the non-printing field
% timescited so that styles do not print it via note.
@ARTICLE{Hunter2004,
  author = {Hunter, J. C. and McCoy, R. A.},
  title = {Applying randomization tests to cluster analyses},
  journal = {Journal of Vegetation Science},
  year = {2004},
  volume = {15},
  pages = {135--138},
  number = {1},
  timescited = {0},
  abstract = {In applying randomization tests to hierarchical cluster analyses, we have noted a potentially misleading result: within a significant group, linkages are often identified as significant even when species are randomly distributed among the group's sites. We demonstrate this through a cluster analysis of a constructed matrix with two groups of 20 sites that share no species, and within each group species are randomly distributed among sites. A randomization test identified both of the groups and all linkages within them as significant, while the same test found all linkages non-significant in the cluster analysis of a matrix containing just one of the two groups of 20 sites. In general, a non-random distribution of species within a data set shortens linkages relative to distances in null distributions derived from randomized versions of the data. This confounds efforts to identify significant sub-groups within a significant group. However, the significance of sub-groups possibly could be tested by comparing linkage distances to a null distribution derived from the randomization and clustering of a sub-matrix containing only the sites within the larger group. In essence, this comparison tests the null hypothesis that within the significant group, sites represent random assemblages of species. When applied to actual data sets, an approach involving sequential randomization tests could allow the evaluation of all nodes in a classification, increasing the utility of randomization tests and strengthening the interpretation of groups produced by cluster analysis.},
  keywords = {classification; null distribution; permutation test; resampling; vegetation; SPECIES CLUSTERS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Hubert1976a,
  author = {Hubert, L. J. and Levin, J. R.},
  title = {General statistical framework for assessing categorical clustering in free-recall},
  journal = {Psychological Bulletin},
  year = {1976},
  volume = {83},
  pages = {1072--1080},
  number = {6},
  timescited = {52},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Hubert1976: the export recorded the issue as NOV, which is a month, so it
% is stored with the standard month macro instead of the number field.
@ARTICLE{Hubert1976,
  author = {Hubert, L. and Schultz, J.},
  title = {Quadratic assignment as a general data-analysis strategy},
  journal = {British Journal of Mathematical \& Statistical Psychology},
  year = {1976},
  volume = {29},
  pages = {190--241},
  month = nov,
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Orloci1967: the export gave no end page; "193+" is the BibTeX notation
% for "page 193 and following".
@ARTICLE{Orloci1967,
  author = {Orloci, L.},
  title = {An agglomerative method for classification of plant communities},
  journal = {Journal of Ecology},
  year = {1967},
  volume = {55},
  pages = {193+},
  number = {1},
  timescited = {244},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Baroni-Urbani1976,
  author = {Baroni-Urbani, Cesare and Buser, Mauro W.},
  title = {Similarity of Binary Data},
  journal = {Systematic Zoology},
  year = {1976},
  volume = {25},
  pages = {251--259},
  number = {3},
  abstract = {A set of intuitively obvious properties of a coefficient of similarity for binary data is established. Critical examination of the coefficients available from the literature shows that none of them satisfies all these properties, and a new coefficient is proposed to obviate this inconvenience. The distribution of the new coefficient is studied on very large, perfectly random samples of OTU's, and a table is constructed to show its critical values at different significance levels.
A mathematical appendix is given to demonstrate the procedure used to construct perfectly random samples of different size.},
  copyright = {Copyright © 1976 Society of Systematic Biologists},
  issn = {00397989},
  jstor_articletype = {primary_article},
  jstor_formatteddate = {Sep., 1976},
  owner = {rebeccaspeckman},
  publisher = {Taylor \& Francis, Ltd. for the Society of Systematic Biologists},
  timestamp = {2009.03.03}
}

% Citation counts below are kept in the non-printing field timescited so
% that styles do not print them via note.
@ARTICLE{Nemec1988,
  author = {Nemec, A. F. L. and Brinkhurst, R. O.},
  title = {Using the bootstrap to assess statistical significance in the cluster-analysis of species abundance data},
  journal = {Canadian Journal of Fisheries and Aquatic Sciences},
  year = {1988},
  volume = {45},
  pages = {965--970},
  number = {6},
  timescited = {31},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Mojena1977,
  author = {Mojena, R.},
  title = {Hierarchical grouping methods and stopping rules: an evaluation},
  journal = {Computer Journal},
  year = {1977},
  volume = {20},
  pages = {359--363},
  number = {4},
  timescited = {207},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Milligan1983,
  author = {Milligan, G. W. and Soon, S. C. and Sokol, L. M.},
  title = {The effect of cluster size, dimensionality, and the number of clusters on recovery of true cluster structure},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year = {1983},
  volume = {5},
  pages = {40--47},
  number = {1},
  timescited = {57},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Milligan1987,
  author = {Milligan, G. W. and Cooper, M. C.},
  title = {Methodology review - clustering methods},
  journal = {Applied Psychological Measurement},
  year = {1987},
  volume = {11},
  pages = {329--354},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Milligan1985,
  author = {Milligan, G. W. and Cooper, M.
C.},
  title = {An examination of procedures for determining the number of clusters in a data set},
  journal = {Psychometrika},
  year = {1985},
  volume = {50},
  pages = {159--179},
  number = {2},
  comment = {I have hard copy},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Milligan1981,
  author = {Milligan, G. W.},
  title = {A {Monte-Carlo} study of 30 internal criterion measures for cluster analysis},
  journal = {Psychometrika},
  year = {1981},
  volume = {46},
  pages = {187--199},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Milligan1981a: the citation count is kept in the non-printing field
% timescited so that styles do not print it via note.
@ARTICLE{Milligan1981a,
  author = {Milligan, G. W.},
  title = {A review of {Monte-Carlo} tests of cluster analysis},
  journal = {Multivariate Behavioral Research},
  year = {1981},
  volume = {16},
  pages = {379--407},
  number = {3},
  timescited = {114},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% McShane2002: the personal reminder is kept in the non-printing comment
% field so that styles do not print it via note.
@ARTICLE{McShane2002,
  author = {McShane, L. M. and Radmacher, M. D. and Freidlin, B. and Yu, R. and Li, M. C. and Simon, R.},
  title = {Methods for assessing reproducibility of clustering patterns observed in analyses of microarray data},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {1462--1469},
  number = {11},
  comment = {I have hard copy},
  abstract = {Motivation: Recent technological advances such as cDNA microarray technology have made it possible to simultaneously interrogate thousands of genes in a biological specimen. A cDNA microarray experiment produces a gene expression 'profile'. Often interest lies in discovering novel subgroupings, or 'clusters', of specimens based on their profiles, for example identification of new tumor taxonomies. Cluster analysis techniques such as hierarchical clustering and self-organizing maps have frequently been used for investigating structure in microarray data. However, clustering algorithms always detect clusters, even on random data, and it is easy to misinterpret the results without some objective measure of the reproducibility of the clusters.
Results: We present statistical methods for testing for overall clustering of gene expression profiles, and we define easily interpretable measures of cluster-specific reproducibility that facilitate understanding of the clustering structure. We apply these methods to elucidate structure in cDNA microarray gene expression profiles obtained on melanoma tumors and on prostate specimens.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Huang1998,
  author = {Huang, Z. X.},
  title = {Extensions to the {$k$-Means} algorithm for clustering large data sets with categorical values},
  journal = {Data Mining and Knowledge Discovery},
  year = {1998},
  volume = {2},
  pages = {283--304},
  number = {3},
  abstract = {The k-means algorithm is well known for its efficiency in clustering large data sets. However, working only on numeric values prohibits it from being used to cluster real world data containing categorical values. In this paper we present two algorithms which extend the k-means algorithm to categorical domains and domains with mixed numeric and categorical values. The k-modes algorithm uses a simple matching dissimilarity measure to deal with categorical objects, replaces the means of clusters with modes, and uses a frequency-based method to update modes in the clustering process to minimise the clustering cost function. With these extensions the k-modes algorithm enables the clustering of categorical data in a fashion similar to k-means. The k-prototypes algorithm, through the definition of a combined dissimilarity measure, further integrates the k-means and k-modes algorithms to allow for clustering objects described by mixed numeric and categorical attributes. We use the well known soybean disease and credit approval data sets to demonstrate the clustering performance of the two algorithms.
Our experiments on two real world data sets with half a million objects each show that the two algorithms are efficient when clustering large data sets, which is critical to data mining applications.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{He2008,
  author = {He, Z. Y. and Xu, X. F. and Deng, S. C.},
  title = {{k-ANMI}: A mutual information based clustering algorithm for categorical data},
  journal = {Information Fusion},
  year = {2008},
  volume = {9},
  pages = {223--233},
  number = {2},
  abstract = {Clustering categorical data is an integral part of data mining and has attracted much attention recently. In this paper, we present k-ANMI, a new efficient algorithm for clustering categorical data. The k-ANMI algorithm works in a way that is similar to the popular k-means algorithm, and the goodness of clustering in each step is evaluated using a mutual information based criterion (namely, average normalized mutual information-ANMI) borrowed from cluster ensemble. This algorithm is easy to implement, requiring multiple hash tables as the only major data structure. Experimental results on real datasets show that k-ANMI algorithm is competitive with those state-of-the-art categorical data clustering algorithms with respect to clustering accuracy. (c) 2006 Elsevier B.V. All rights reserved.},
  keywords = {clustering; categorical data; mutual information; cluster ensemble; data mining; DISSIMILARITY MEASURE; SYMBOLIC PATTERNS; DATA SETS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% He2005: a paper in a conference proceedings volume (see note), so it is
% typed INPROCEEDINGS and the proceedings title is held in booktitle.
@INPROCEEDINGS{He2005,
  author = {He, Z. Y. and Deng, S. C. and Xu, X. F.},
  title = {Improving {K-modes} algorithm considering frequencies of attribute values in mode},
  booktitle = {Computational Intelligence and Security, Pt 1, Proceedings},
  year = {2005},
  volume = {3801},
  pages = {157--162},
  note = {International Conference on Computational Intelligence and Security DEC 15-19, 2005 Xi'an, PEOPLES R CHINA},
  abstract = {In this paper, we present an experimental study on applying a new dissimilarity measure to the k-modes clustering algorithm to improve its clustering accuracy. The measure is based on the idea that the similarity between a data object and cluster mode, is directly proportional to the sum of relative frequencies of the common values in mode. Experimental results on real life datasets show that, the modified algorithm is superior to the original k-modes algorithm with respect to clustering accuracy.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Handl2005,
  author = {Handl, J. and Knowles, J. and Kell, D. B.},
  title = {Computational cluster validation in post-genomic data analysis},
  journal = {Bioinformatics},
  year = {2005},
  volume = {21},
  pages = {3201--3212},
  number = {15},
  abstract = {Motivation: The discovery of novel biological knowledge from the ab initio analysis of post-genomic data relies upon the use of unsupervised processing methods, in particular clustering techniques. Much recent research in bioinformatics has therefore been focused on the transfer of clustering methods introduced in other scientific fields and on the development of novel algorithms specifically designed to tackle the challenges posed by post-genomic data. The partitions returned by a clustering algorithm are commonly validated using visual inspection and concordance with prior biological knowledge-whether the clusters actually correspond to the real structure in the data is somewhat less frequently considered.
Suitable computational cluster validation techniques are available in the general data-mining literature, but have been given only a fraction of the same attention in bioinformatics. Results: This review paper aims to familiarize the reader with the battery of techniques available for the validation of clustering results, with a particular focus on their application to post-genomic data analysis. Synthetic and real biological datasets are used to demonstrate the benefits, and also some of the perils, of analytical cluster validation. Availability: The software used in the experiments is available at http://dbkweb.ch.umist.ac.uk/handl/clustervalidation/ Contact: J.Handl@postgrad.manchester.ac.uk Supplementary information: Enlarged colour plots are provided in the Supplementary Material, which is available at http://dbkweb.ch.umist.ac.uk/handl/clustervalidation/},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Halkidi: placeholder with no journal or year, so it is typed MISC (ARTICLE
% requires both). The reminder that was part of the title is kept in the
% non-printing comment field. TODO: replace with the real reference.
@MISC{Halkidi,
  author = {Halkidi},
  title = {Clustering validation review},
  comment = {no date or journal -- they have stuff in journals},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Guvenir1998,
  author = {Guvenir, H. A. and Demiroz, G. and Ilter, N.},
  title = {Learning differential diagnosis of erythemato-squamous diseases using voting feature intervals},
  journal = {Artificial Intelligence in Medicine},
  year = {1998},
  volume = {13},
  pages = {147--165},
  number = {3},
  abstract = {A new classification algorithm, called VFI5 (for Voting Feature Intervals), is developed and applied to problem of differential diagnosis of erythemato-squamous diseases. The domain contains records of patients with known diagnosis. Given a training set of such records, the VFI5 classifier learns how to differentiate a new case in the domain. VFI5 represents a concept in the form of feature intervals on each feature dimension separately. Classification in the VFI5 algorithm is based on a real-valued voting.
Each feature equally participates in the voting process and the class that receives the maximum amount of votes is declared to be the predicted class. The performance of the VFI5 classifier is evaluated empirically in terms of classification accuracy and running time. (C) 1998 Elsevier Science B.V. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Guha2000,
  author = {Guha, S. and Rastogi, R. and Shim, K.},
  title = {Rock: A robust clustering algorithm for categorical attributes},
  journal = {Information Systems},
  year = {2000},
  volume = {25},
  pages = {345--366},
  number = {5},
  abstract = {Clustering, in data mining, is useful to discover distribution patterns in the underlying data. Clustering algorithms usually employ a distance metric based (e.g., euclidean) similarity measure in order to partition the database such that data points in the same partition are more similar than points in different partitions. In this paper, we study clustering algorithms for data with boolean and categorical attributes. We show that traditional clustering algorithms that use distances between points for clustering are not appropriate for boolean and categorical attributes. Instead, we propose a novel concept of links to measure the similarity/proximity between a pair of data points. We develop a robust hierarchical clustering algorithm ROCK that employs links and not distances when merging clusters. Our methods naturally extend to non-metric similarity measures that are relevant in situations where a domain expert/similarity table is the only source of knowledge. In addition to presenting detailed complexity results for ROCK, we also conduct an experimental study with real-life as well as synthetic data sets to demonstrate the effectiveness of our techniques. For data with categorical attributes, our findings indicate that ROCK not only generates better quality clusters than traditional algorithms, but it also exhibits good scalability properties.
(C) 2000 Published by Elsevier Science Ltd.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@INPROCEEDINGS{Grim2007,
  author = {Grim, J. and Hora, J.},
  title = {Minimum information loss cluster analysis for categorical data},
  booktitle = {5th International Conference on Machine Learning and Data Mining in Pattern Recognition},
  year = {2007},
  editor = {Perner, P.},
  pages = {233--247},
  address = {Leipzig, Germany},
  abstract = {The EM algorithm has been used repeatedly to identify latent classes in categorical data by estimating finite distribution mixtures of product components. Unfortunately, the underlying mixtures are not uniquely identifiable and, moreover, the estimated mixture parameters are starting-point dependent. For this reason we use the latent class model only to define a set of "elementary" classes by estimating a mixture of a large number components. We propose a hierarchical "bottom up" cluster analysis based on unifying the elementary latent classes sequentially. The clustering procedure is controlled by minimum information loss criterion.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Grim2006: editors are separated with "and" so that BibTeX parses five
% names; surnames are kept exactly as spelled in the export.
@INPROCEEDINGS{Grim2006,
  author = {Grim, J.},
  title = {{EM} cluster analysis for categorical data},
  booktitle = {Joint International Workshop on Structural, Syntactic, and Statistical Pattern Recognition},
  year = {2006},
  editor = {Yeung, D. Y. and Kwok, J. T. and Fred, A. and Roli, F. and DeRidder, D.},
  pages = {640--648},
  address = {Hong Kong, People's Republic of China},
  abstract = {Distribution mixtures with product components have been applied repeatedly to determine clusters in multivariate data. Unfortunately, for categorical variables the mixture parameters are not uniquely identifiable and therefore the result of cluster analysis may become questionable. We give a simple proof that any non-degenerate discrete product mixture can be equivalently described by infinitely many different parameter sets.
Nevertheless a unique result of cluster analysis can be guaranteed by additional constraints. We propose a heuristic method of sequential estimation of components to guarantee a unique identification of clusters by means of EM algorithm. The application of the method is illustrated by a numerical example.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Gowda1992,
  author = {Gowda, K. C. and Diday, E.},
  title = {Symbolic clustering using a new similarity measure},
  journal = {IEEE Transactions on Systems Man and Cybernetics},
  year = {1992},
  volume = {22},
  pages = {368--378},
  number = {2},
  abstract = {A hierarchical, agglomerative, symbolic clustering methodology based on a new similarity measure that takes into consideration the "position," "span," and "content," of symbolic objects is proposed. The similarity measure used is of a new type in the sense that it is not just another aspect of dissimilarity such as the reciprocal of a distance measure. The clustering methodology forms composite symbolic objects using a Cartesian join operator when two symbolic objects are merged. The maximum and minimum similarity values at various merging levels enable the determination of the number of clusters in the data set. The composite symbolic objects representing different clusters give a description of the resulting classes and lead to knowledge acquisition. The algorithm appears very versatile as it is capable of discerning clusters in data sets made up of numeric as well as symbolic objects consisting of different types and combinations of qualitative and quantitative feature values. In particular, the algorithm is applied on two data sets of fat-oil and microcomputers.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{McRae1971,
  author = {McRae, D.
J.},
  title = {{MIKCA} - {Fortran-IV} iterative {K-means} cluster analysis program},
  journal = {Behavioral Science},
  year = {1971},
  volume = {16},
  pages = {423+},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{McLachlan2004,
  author = {McLachlan, G. J. and Chang, S. U.},
  title = {Mixture modelling for cluster analysis},
  journal = {Statistical Methods in Medical Research},
  year = {2004},
  volume = {13},
  pages = {347--361},
  number = {5},
  abstract = {Cluster analysis via a finite mixture model approach is considered. With this approach to clustering, the data can be partitioned into a specified number of clusters g by first fitting a mixture model with g components. An outright clustering of the data is then obtained by assigning an observation to the component to which it has the highest estimated posterior probability of belonging; that is, the ith cluster consists of those observations assigned to the ith component (i = 1,..., g). The focus is on the use of mixtures of normal components for the cluster analysis of data that can be regarded as being continuous. But attention is also given to the case of mixed data, where the observations consist of both continuous and discrete variables.},
  keywords = {EM ALGORITHM; LIKELIHOOD; ANALYZERS; CRITERIA; CHOICE},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Matthiessen2003: the genus and place names in the title are braced so
% that sentence-casing styles keep their capitals.
@INPROCEEDINGS{Matthiessen2003,
  author = {Matthiessen, B. and Fock, H. O.},
  title = {A null model for the analysis of dietary overlap in {Macroramphosus} spp. at the {Great Meteor Seamount} (subtropical {North-east Atlantic})},
  booktitle = {Census of Marine Life Symposium},
  year = {2003},
  pages = {294--304},
  address = {Newport, OR},
  abstract = {Macroramphosus spp. (Centriscidae, Macroramphosinae) are the most abundant demersal fishes on the Great Meteor Seamount (GMR, subtropical NE Atlantic, 30 degrees N, 28.5 degrees W). Previous investigations evidenced the existence of two species that differed in morphology and diet.
A deep-bodied benthos feeding type (b-type), M. scolopax, could be significantly distinguished from a slender planktivorous type (p-type), M. gracilis. In this study we investigate whether dietary differences are also significant. Based on a novel null model termed ortho-RA3 in accordance with Lawlor (1980), the analysis of dietary overlap revealed significant differences between feeding types for two of three hypotheses that were tested, i.e., between-group differences and within-group differences for M. gracilis. Reasons for the failure of the third hypothesis (within-group differences for M. scolopax) are discussed. Our results reinforce the hypothesis of two competing species of snipefishes at GMR.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Marriott1982,
  author = {Marriott, F. H. C.},
  title = {Optimization methods of cluster analysis},
  journal = {Biometrika},
  year = {1982},
  volume = {69},
  pages = {417--421},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Marriott1971: the export gave no end page; "501+" is the BibTeX notation
% for "page 501 and following". The citation count is kept in the
% non-printing field timescited so that styles do not print it via note.
@ARTICLE{Marriott1971,
  author = {Marriott, F. H.},
  title = {Practical problems in a method of cluster analysis},
  journal = {Biometrics},
  year = {1971},
  volume = {27},
  pages = {501+},
  number = {3},
  timescited = {73},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Maronna1974,
  author = {Maronna, R. and Jacovkis, P. M.},
  title = {Multivariate clustering procedures with variable metrics},
  journal = {Biometrics},
  year = {1974},
  volume = {30},
  pages = {499--505},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Manton2004,
  author = {Manton, K. G. and Gu, X. L. and Huang, H.
and Kovtun, M.},
  title = {Fuzzy set analyses of genetic determinants of health and disability status},
  journal = {Statistical Methods in Medical Research},
  year = {2004},
  volume = {13},
  pages = {395--408},
  number = {5},
  abstract = {Analyses of complex genotype-phenotype relations require new statistical procedures because of the potentially high dimensionability of those relations which are expressed with both measurement error and stochasticity in the correlation function. We propose modifying a multivariate procedure called grade of membership (GoM) analysis to deal with the special problems of such analyses. In doing so, we make clear some special features of the GoM model for multivariate analysis of high dimensional, discrete data. This is illustrated for apolipoprotein E (APOE) assessments made on 1805 people in the 1999 National Long Term Care Survey. A number of interesting relations with APOE polymorphism were found where disability profiles were more predictive than specific diagnoses because they implicitly contained information on chronicity and severity of disease processes.},
  keywords = {APOLIPOPROTEIN-E POLYMORPHISM; CORONARY-ARTERY DISEASE; CELL-PROLIFERATION INDEXES; E PHENOTYPE; MYOCARDIAL-INFARCTION; HEART-DISEASE; E GENOTYPE; PLASMA-LIPOPROTEINS; RISK FACTOR; E ISOFORMS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Citation key left unchanged so existing \cite commands keep working.
% The author field previously held the two given names joined by a slash; it now
% lists the Mersenne Twister authors as two people in "Last, First" form.
% TODO(review): add url and urldate for the website.
@MISC{Makoto/Takuji,
  author = {Matsumoto, Makoto and Nishimura, Takuji},
  title = {{Mersenne Twister} {RNG} website},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Lunn1998,
  author = {Lunn, A. D. and Davies, S. J.},
  title = {A note on generating correlated binary variables},
  journal = {Biometrika},
  year = {1998},
  volume = {85},
  pages = {487--490},
  number = {2},
  abstract = {It is important to be able to generate correlated binary data in an efficient easily programmed manner for, among other things, the generation of large bootstrap samples. 
In this note nothing more than the basic ingredient of the uniform random number generator is required for simulating binary data from the most commonly occurring correlation structures.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% TODO(review): this record has no pages/article number; confirm against the publisher record.
@ARTICLE{Loganantharaj2006,
  author = {Loganantharaj, R. and Cheepala, S. and Clifford, J.},
  title = {Metric for measuring the effectiveness of clustering of {DNA} microarray expression},
  journal = {BMC Bioinformatics},
  year = {2006},
  volume = {7},
  note = {3rd Annual Conference of the MidSouth-Computational-Biology-and-Bioinformatics-Society MAR 02-04, 2006 Baton Rouge, LA},
  abstract = {Background: The recent advancement of microarray technology with lower noise and better affordability makes it possible to determine expression of several thousand genes simultaneously. The differentially expressed genes are filtered first and then clustered based on the expression profiles of the genes. A large number of clustering algorithms and distance measuring matrices are proposed in the literature. The popular ones among them include hierarchal clustering and k-means clustering. These algorithms have often used the Euclidian distance or Pearson correlation distance. The biologists or the practitioners are often confused as to which algorithm to use since there is no clear winner among algorithms or among distance measuring metrics. Several validation indices have been proposed in the literature and these are based directly or indirectly on distances; hence a method that uses any of these indices does not relate to any biological features such as biological processes or molecular functions. Results: In this paper we have proposed a metric to measure the effectiveness of clustering algorithms of genes by computing inter-cluster cohesiveness and as well as the intra-cluster separation with respect to biological features such as biological processes or molecular functions. We have applied this metric to the clusters on the data set that we have created as part of a larger study to determine the cancer suppressive mechanism of a class of chemicals called retinoids. We have considered hierarchal and k-means clustering with Euclidian and Pearson correlation distances. Our results show that genes of similar expression profiles are more likely to be closely related to biological processes than they are to molecular functions. The findings have been supported by many works in the area of gene clustering. Conclusion: The best clustering algorithm of genes must achieve cohesiveness within a cluster with respect to some biological features, and as well as maximum separation between clusters in terms of the distribution of genes of a behavioral group across clusters. We claim that our proposed metric is novel in this respect and that it provides a measure of both inter and intra cluster cohesiveness. Best of all, computation of the proposed metric is easy and it provides a single quantitative value, which makes comparison of different algorithms easier. The maximum cluster cohesiveness and the maximum intra-cluster separation are indicated by the metric when its value is 0. We have demonstrated the metric by applying it to a data set with gene behavioral groupings such as biological process and molecular functions. The metric can be easily extended to other features of a gene such as DNA binding sites and protein-protein interactions of the gene product, special features of the intron-exon structure, promoter characteristics, etc. The metric can also be used in other domains that use two different parametric spaces; one for clustering and the other one for measuring the effectiveness.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Gordon1994,
  author = {Gordon, A. 
D.},
  title = {Identifying genuine clusters in a classification},
  journal = {Computational Statistics \& Data Analysis},
  year = {1994},
  volume = {18},
  pages = {561--581},
  number = {5},
  abstract = {The paper addresses the problem of assessing the validity of clusters produced by a clustering procedure. Several null models for data are described. Previous research is reviewed, it being shown how much of it can be formulated in terms of properties of sets of within-cluster and between-cluster pairwise dissimilarities. A Monte Carlo test for assessing the value of a U-statistic based on these sets of pairwise dissimilarities is described and illustrated on four data sets. The final section includes further discussion of ways of specifying relevant null models.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Gange1995,
  author = {Gange, S. J.},
  title = {Generating multivariate categorical variates using the iterative proportional fitting algorithm},
  journal = {American Statistician},
  year = {1995},
  volume = {49},
  pages = {134--138},
  number = {2},
  abstract = {Two recent papers have suggested methods for generating correlated binary data with fixed marginal distributions and specified degrees of pairwise association. Emrich and Piedmonte suggested a method based on the existence of a multivariate normal distribution, while Lee suggested methods based on linear programming and Archimedian copulas. In this paper, a simpler method is described using the iterative proportional fitting algorithm for generating an n-dimensional distribution of correlated categorical data with specified margins of dimension 1,2,..., k < n. 
An example of generating a distribution for a generalized estimating equations (GEE) model is discussed.},
  keywords = {CORRELATED OUTCOMES; GENERALIZED ESTIMATING EQUATIONS; LOG-LINEAR MODELS; RANDOM NUMBER GENERATION; BINARY; ASSOCIATION; REGRESSION},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% In the three older records below, the export's "-\&" end-page placeholder is
% replaced with the published page range, and the citation count moves from the
% typeset "note" field to the non-printing "timescited" field.
@ARTICLE{Fukunaga1970,
  author = {Fukunaga, K. and Koontz, W. L. G.},
  title = {A criterion and an algorithm for grouping data},
  journal = {IEEE Transactions on Computers},
  year = {1970},
  volume = {C-19},
  pages = {917--923},
  number = {10},
  timescited = {33},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Friedman1967,
  author = {Friedman, H. P. and Rubin, J.},
  title = {On some invariant criteria for grouping data},
  journal = {Journal of the American Statistical Association},
  year = {1967},
  volume = {62},
  pages = {1159--1178},
  number = {320},
  timescited = {218},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Everitt1979,
  author = {Everitt, B. S.},
  title = {Unresolved problems in cluster analysis},
  journal = {Biometrics},
  year = {1979},
  volume = {35},
  pages = {169--181},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The second author was exported in truncated form as "Cavallis.Ll"; restored to the full surname.
@ARTICLE{Edwards1965,
  author = {Edwards, A. W. F. and Cavalli-Sforza, L. L.},
  title = {A method for cluster analysis},
  journal = {Biometrics},
  year = {1965},
  volume = {21},
  pages = {362--375},
  number = {2},
  timescited = {160},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Dutta2005,
  author = {Dutta, M. and Mahanta, A. K. and Pujari, A. K.},
  title = {{QROCK}: A quick version of the {ROCK} algorithm for clustering of categorical data},
  journal = {Pattern Recognition Letters},
  year = {2005},
  volume = {26},
  pages = {2364--2373},
  number = {15},
  abstract = {The ROCK algorithm is an agglomerative hierarchical clustering algorithm for clustering categorical data [Guha S., Rastogi, R., Shim, K., 1999. ROCK: A robust clustering algorithm for categorical attributes. In: Proc. IEEE Internat. Conf. 
Data Engineering, Sydney, March 1999]. In this paper we prove that under certain conditions, the final clusters obtained by the algorithm are nothing but the connected components of a certain graph with the input data-points as vertices. We propose a new algorithm QROCK which computes the clusters by determining the connected components of the graph. This leads to a very efficient method of obtaining the clusters giving a drastic reduction of the computing time of the ROCK algorithm. We also justify that it is more practical for specifying the similarity threshold rather than specifying the desired number of clusters a priori. The QROCK algorithm also detects the outliers in this process. We also discuss a new similarity measure for categorical attributes. (c) 2005 Elsevier B.V. All rights reserved.},
  keywords = {data clustering; categorical attributes; hierarchical clustering; similarity measure},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Dubes1987,
  author = {Dubes, R. C.},
  title = {How many clusters are best - an experiment},
  journal = {Pattern Recognition},
  year = {1987},
  volume = {20},
  pages = {645--663},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Dubes1979,
  author = {Dubes, R. and Jain, A. K.},
  title = {Validity studies in clustering methodologies},
  journal = {Pattern Recognition},
  year = {1979},
  volume = {11},
  pages = {235--254},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Dubes1976,
  author = {Dubes, R. and Jain, A. K.},
  title = {Clustering techniques - users dilemma},
  journal = {Pattern Recognition},
  year = {1976},
  volume = {8},
  pages = {247--260},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Stub record with no journal; typed MISC because ARTICLE requires a journal field.
% NOTE(review): "wp13" presumably means working paper 13 - retype as TECHREPORT
% once the institution and full title are confirmed.
@MISC{Leisch1998,
  author = {Leisch and Weingessel},
  title = {wp13 simulating binary data},
  year = {1998},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Lee1979,
  author = {Lee, K. 
L.},
  title = {Multivariate tests for clusters},
  journal = {Journal of the American Statistical Association},
  year = {1979},
  volume = {74},
  pages = {708--714},
  number = {367},
  timescited = {35},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The citation count is kept in the non-printing field "timescited" because "note" is typeset.
@ARTICLE{Kuncheva2006,
  author = {Kuncheva, L. I. and Vetrov, D. P.},
  title = {Evaluation of stability of k-means cluster ensembles with respect to random initialization},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year = {2006},
  volume = {28},
  pages = {1798--1808},
  number = {11},
  timescited = {6},
  abstract = {Many clustering algorithms, including cluster ensembles, rely on a random component. Stability of the results across different runs is considered to be an asset of the algorithm. The cluster ensembles considered here are based on k-means clusterers. Each clusterer is assigned a random target number of clusters, k and is started from a random initialization. Here, we use 10 artificial and 10 real data sets to study ensemble stability with respect to random k, and random initialization. The data sets were chosen to have a small number of clusters (two to seven) and a moderate number of data points (up to a few hundred). Pairwise stability is defined as the adjusted Rand index between pairs of clusterers in the ensemble, averaged across all pairs. Nonpairwise stability is defined as the entropy of the consensus matrix of the ensemble. An experimental comparison with the stability of the standard k-means algorithm was carried out for k from 2 to 20. The results revealed that ensembles are generally more stable, markedly so for larger k. To establish whether stability can serve as a cluster validity index, we first looked at the relationship between stability and accuracy with respect to the number of clusters, k. 
We found that such a relationship strongly depends on the data set, varying from almost perfect positive correlation (0.97, for the glass data) to almost perfect negative correlation (-0.93, for the crabs data). We propose a new combined stability index to be the sum of the pairwise individual and ensemble stabilities. This index was found to correlate better with the ensemble accuracy. Following the hypothesis that a point of stability of a clustering algorithm corresponds to a structure found in the data, we used the stability measures to pick the number of clusters. The combined stability index gave best results.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Krzanowski1988,
  author = {Krzanowski, W. J. and Lai, Y. T.},
  title = {A criterion for determining the number of groups in a data set using sum-of-squares clustering},
  journal = {Biometrics},
  year = {1988},
  volume = {44},
  pages = {23--34},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Krummenauer1998,
  author = {Krummenauer, F.},
  title = {Efficient simulation of multivariate binomial and {Poisson} distributions},
  journal = {Biometrical Journal},
  year = {1998},
  volume = {40},
  pages = {823--832},
  number = {7},
  abstract = {Power investigations, for example, in statistical procedures for the assessment of agreement among multiple raters often require the simultaneous simulation of several dependent binomial or Poisson distributions to appropriately model the stochastical dependencies between the raters' results. Regarding the rather large dimensions of the random vectors to be generated and the even larger number of interactions to be introduced into the simulation scenarios to determine all necessary information on their distributions' dependence structure, one needs efficient and fast algorithms for the simulation of multivariate Poisson and binomial distributions. 
Therefore two equivalent models for the multivariate Poisson distribution are combined to obtain an algorithm for the quick implementation of its multivariate dependence structure. Simulation of the multivariate Poisson distribution then becomes feasible by first generating and then convoluting independent univariate Poisson variates with appropriate expectations. The latter can be computed via linear recursion formulae. Similar means for simulation are also considered for the binomial setting. In this scenario it turns out, however, that exact computation of the probability function is even easier to perform; therefore corresponding linear recursion formulae for the point probabilities of multivariate binomial distributions are presented, which only require information about the index parameter and the (simultaneous) success probabilities, that is the multivariate dependence structure among the binomial marginals.},
  keywords = {multivariate discrete distributions; dependence structure; binomial distribution; Poisson distribution; simulation},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Krieger1999,
  author = {Krieger, A. M. and Green, P. E.},
  title = {A cautionary note on using internal cross validation to select the number of clusters},
  journal = {Psychometrika},
  year = {1999},
  volume = {64},
  pages = {341--353},
  number = {3},
  abstract = {A highly popular method for examining the stability of a data clustering is to split the data into two parts, cluster the observations in Part A, assign the objects in Part B to their nearest centroid in Part A, and then independently cluster the Part B objects. One then examines how close the two partitions are (say, by the Rand measure). Another proposal is to split the data into k parts, and see how their centroids cluster. 
By means of synthetic data analyses, we demonstrate that these approaches fail to identify the appropriate number of clusters, particularly as sample size becomes large and the variables exhibit higher correlations.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Kapp2007,
  author = {Kapp, A. V. and Tibshirani, R.},
  title = {Are clusters found in one dataset present in another dataset?},
  journal = {Biostatistics},
  year = {2007},
  volume = {8},
  pages = {9--31},
  number = {1},
  abstract = {In many microarray studies, a cluster defined on one dataset is sought in an independent dataset. If the cluster is found in the new dataset, the cluster is said to be "reproducible" and may be biologically significant. Classifying a new datum to a previously defined cluster can be seen as predicting which of the previously defined clusters is most similar to the new datum. If the new data classified to a cluster are similar, molecularly or clinically, to the data already present in the cluster, then the cluster is reproducible and the corresponding prediction accuracy is high. Here, we take advantage of the connection between reproducibility and prediction accuracy to develop a validation procedure for clusters found in datasets independent of the one in which they were characterized. We define a cluster quality measure called the "in-group proportion" (IGP) and introduce a general procedure for individually validating clusters. Using simulations and real breast cancer datasets, the IGP is compared to four other popular cluster quality measures (homogeneity score, separation score, silhouette width, and weighted average discrepant pairs score). Moreover, simulations and the real breast cancer datasets are used to compare the four versions of the validation procedure which all use the IGP, but differ in the way in which the null distributions are generated. 
We find that the IGP is the best measure of prediction accuracy, and one version of the validation procedure is the more widely applicable than the other three. An implementation of this algorithm is in a package called "clusterRepro" available through The Comprehensive R Archive Network (http://cran.r-project.org).},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The editor list was exported without "and" separators or surname commas, so
% BibTeX read it as one person; it is now three names in "Last, Initials" form.
@INPROCEEDINGS{Juan2004a,
  author = {Juan, A. and Vidal, E.},
  title = {{Bernoulli} mixture models for binary images},
  booktitle = {17th International Conference on Pattern Recognition (ICPR)},
  year = {2004},
  editor = {Kittler, J. and Petrou, M. and Nixon, M.},
  pages = {367--370},
  address = {Cambridge, England},
  abstract = {Mixture modelling is a hot area in pattern recognition. Although most research in this area has focused on mixtures for continuous data, there are many pattern recognition tasks for which binary or discrete mixtures are better suited. This paper focuses on the use of Bernoulli mixtures for binary data and, in particular for binary images. Results are reported on a task of handwritten Indian digits.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@INPROCEEDINGS{Juan2001,
  author = {Juan, A. and Vidal, E.},
  title = {On the use of {Bernoulli} mixture models for text classification},
  booktitle = {1st International Workshop on Pattern Recognition in Information Systems},
  year = {2001},
  pages = {2705--2710},
  address = {Setubal, Portugal},
  abstract = {Mixture modelling of class-conditional densities is a standard pattern recognition technique. Although most research on mixture models has concentrated on mixtures for continuous data, emerging pattern recognition applications demand extending research efforts to other data types. This paper focuses on the application of mixtures of multivariate Bernoulli distributions to binary data. More concretely, a text classification task aimed at improving language modelling for machine translation is considered. (C) 2002 Pattern Recognition Society. 
Published by Elsevier Science Ltd. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Percent signs in the abstract are escaped so the field is safe if a style typesets it.
% TODO(review): this record has no pages/article number; confirm against the publisher record.
@ARTICLE{Yin2008,
  author = {Yin, Z. and Zhou, X. B. and Bakal, C. and Li, F. H. and Sun, Y. X. and Perrimon, N. and Wong, S. T.},
  title = {Using iterative cluster merging with improved gap statistics to perform online phenotype discovery in the context of high-throughput {RNAi} screens},
  journal = {BMC Bioinformatics},
  year = {2008},
  volume = {9},
  abstract = {Background: The recent emergence of high-throughput automated image acquisition technologies has forever changed how cell biologists collect and analyze data. Historically, the interpretation of cellular phenotypes in different experimental conditions has been dependent upon the expert opinions of well-trained biologists. Such qualitative analysis is particularly effective in detecting subtle, but important, deviations in phenotypes. However, while the rapid and continuing development of automated microscope-based technologies now facilitates the acquisition of trillions of cells in thousands of diverse experimental conditions, such as in the context of RNA interference (RNAi) or small-molecule screens, the massive size of these datasets precludes human analysis. Thus, the development of automated methods which aim to identify novel and biological relevant phenotypes online is one of the major challenges in high-throughput image-based screening. Ideally, phenotype discovery methods should be designed to utilize prior/existing information and tackle three challenging tasks, i.e. restoring pre-defined biological meaningful phenotypes, differentiating novel phenotypes from known ones and clarifying novel phenotypes from each other. Arbitrarily extracted information causes biased analysis, while combining the complete existing datasets with each new image is intractable in high-throughput screens. Results: Here we present the design and implementation of a novel and robust online phenotype discovery method with broad applicability that can be used in diverse experimental contexts, especially high-throughput RNAi screens. This method features phenotype modelling and iterative cluster merging using improved gap statistics. A Gaussian Mixture Model (GMM) is employed to estimate the distribution of each existing phenotype, and then used as reference distribution in gap statistics. This method is broadly applicable to a number of different types of image-based datasets derived from a wide spectrum of experimental conditions and is suitable to adaptively process new images which are continuously added to existing datasets. Validations were carried out on different dataset, including published RNAi screening using Drosophila embryos [ Additional files 1, 2], dataset for cell cycle phase identification using HeLa cells [ Additional files 1, 3, 4] and synthetic dataset using polygons, our methods tackled three aforementioned tasks effectively with an accuracy range of 85\%--90\%. When our method is implemented in the context of a Drosophila genome-scale RNAi image-based screening of cultured cells aimed to identifying the contribution of individual genes towards the regulation of cell-shape, it efficiently discovers meaningful new phenotypes and provides novel biological insight. We also propose a two-step procedure to modify the novelty detection method based on one-class SVM, so that it can be used to online phenotype discovery. In different conditions, we compared the SVM based method with our method using various datasets and our methods consistently outperformed SVM based method in at least two of three tasks by 2\% to 5\%. These results demonstrate that our methods can be used to better identify novel phenotypes in image-based datasets from a wide range of conditions and organisms. Conclusion: We demonstrate that our method can detect various novel phenotypes effectively in complex datasets. Experiment results also validate that our method performs consistently under different order of image input, variation of starting conditions including the number and composition of existing phenotypes, and dataset from different screens. In our findings, the proposed method is suitable for online phenotype discovery in diverse high-throughput image-based genetic and chemical screens.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The editor list was exported without "and" separators or surname commas, so
% BibTeX read it as one person; it is now five names in "Last, Initials" form.
@INPROCEEDINGS{Juan2004,
  author = {Juan, A. and Garcia-Hernandez, J. and Vidal, E.},
  title = {{EM} initialisation for {Bernoulli} mixture learning},
  booktitle = {10th International Workshop on Structural and Syntactic Pattern Recognition/5th International Conference on Statistical Techniques in Pattern Recognition},
  year = {2004},
  editor = {Fred, A. and Caelli, T. and Duin, R. P. W. and Campilho, A. and de Ridder, D.},
  pages = {635--643},
  address = {Lisbon, Portugal},
  abstract = {Mixture modelling is a hot area in pattern recognition. This paper focuses on the use of Bernoulli mixtures for binary data and, in particular, for binary images. More specifically, six EM initialisation techniques are described and empirically compared on a classification task of handwritten Indian digits. Somehow surprisingly, we have found that a relatively good initialisation for Bernoulli prototypes is to use slightly perturbed versions of the hypercube centre.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Yeung2001a,
  author = {Yeung, K. Y. and Ruzzo, W. L.},
  title = {Principal component analysis for clustering gene expression data},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {763--774},
  number = {9},
  abstract = {Motivation: There is a great need to develop analytical methodology to analyze and to exploit the information contained in gene expression data. 
Because of the large number of genes and the complexity of biological networks, clustering is a useful exploratory technique for analysis of gene expression data. Other classical techniques, such as principal component analysis (PCA), have also been applied to analyze gene expression data. Using different data analysis techniques and different clustering algorithms to analyze the same data set can lead to very different conclusions. Our goal is to study the effectiveness of principal components (PCs) in capturing cluster structure. Specifically, using both real and synthetic gene expression data sets, we compared the quality of clusters obtained from the original data to the quality of clusters obtained after projecting onto subsets of the principal component axes. Results: Our empirical study showed that clustering with the PCs instead of the original variables does not necessarily improve, and often degrades, cluster quality. In particular, the first few PCs (which contain most of the variation in the data) do not necessarily capture most of the cluster structure. We also showed that clustering with PCs has different impact on different algorithms and different similarity metrics. Overall, we would not recommend PCA before clustering except in special circumstances.},
  keywords = {PATTERNS; CRITERIA; ARRAY},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Yeung2001,
  author = {Yeung, K. Y. and Haynor, D. R. and Ruzzo, W. L.},
  title = {Validating clustering for gene expression data},
  journal = {Bioinformatics},
  year = {2001},
  volume = {17},
  pages = {309--318},
  number = {4},
  abstract = {Motivation: Many clustering algorithms have been proposed for the analysis of gene expression data, but little guidance is available to help choose among them. We provide a systematic framework for assessing the results of clustering algorithms. Clustering algorithms attempt to partition the genes into groups exhibiting similar patterns of variation in expression level. 
Our methodology is to apply a clustering algorithm to the data from all but one experimental condition. The remaining condition is used to assess the predictive power of the resulting clusters---meaningful clusters should exhibit less variation in the remaining condition than clusters formed by chance. Results: We successfully applied our methodology to compare six clustering algorithms on four gene expression data sets. We found our quantitative measures of cluster quality to be positively correlated with external standards of cluster quality.},
  keywords = {PATTERNS; NUMBER},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Stub record with no journal; typed MISC because ARTICLE requires a journal field.
% Citation key left unchanged so existing \cite commands keep working; only the
% author spelling is corrected (this file's Dimitriadou2002 entry has "Dolnicar, S.").
% NOTE(review): "WP19" presumably means working paper 19 - retype as TECHREPORT
% once the institution, full title and co-authors are confirmed.
@MISC{Dolcinar1998,
  author = {Dolnicar, S.},
  title = {WP19- more comparison},
  year = {1998},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Windham1987,
  author = {Windham, M. P.},
  title = {Parameter modification for clustering criteria},
  journal = {Journal of Classification},
  year = {1987},
  volume = {4},
  pages = {191--214},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% Stub record with no journal; typed MISC because ARTICLE requires a journal field.
% Citation key left unchanged; author spelling corrected to match Dimitriadou2002.
% NOTE(review): "WP7" presumably means working paper 7 - retype as TECHREPORT
% once the institution, full title and co-authors are confirmed.
@MISC{Dolcinar1998a,
  author = {Dolnicar, S.},
  title = {WP7: comparison of clustering algorithms},
  year = {1998},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Waller1999,
  author = {Waller, N. G. and Underhill, J. M. and Kaiser, H. A.},
  title = {A method for generating simulated plasmodes and artificial test clusters with user-defined shape, size, and orientation},
  journal = {Multivariate Behavioral Research},
  year = {1999},
  volume = {34},
  pages = {123--142},
  number = {2},
  abstract = {We present a simple method for generating simulated plasmodes and artificial test clusters with user-defined shape, size, and orientation. Our method differs from other cluster generation techniques in that it focuses on the validity of the cluster indicators. For J clusters, indicator validity is defined as the squared correlation ratio between the cluster indicator (i.e., the observed variable) and J-1 dummy variables. 
The within-cluster correlation structure and the univariate distributions of the cluster indicators are specified with procedures outlined by Fleishman (1978) and Vale and Maurelli (1983). Simulation results illustrate the utility of the method for cluster analysis evaluation research.},
  keywords = {MIXTURE MODEL TESTS; ANALYSIS ALGORITHMS; ACCURACY},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The export fused the last author keyword with the first indexer keyword
% ("market segmentation SIMILARITY"); a separator is restored between them.
@ARTICLE{Dimitriadou2002,
  author = {Dimitriadou, E. and Dolnicar, S. and Weingessel, A.},
  title = {An examination of indexes for determining the number of clusters in binary data sets},
  journal = {Psychometrika},
  year = {2002},
  volume = {67},
  pages = {137--159},
  number = {1},
  abstract = {The problem of choosing the correct number of clusters is as old as cluster analysis itself. A number of authors have suggested various indexes to facilitate this crucial decision. One of the most extensive comparative studies of indexes was conducted by Milligan and Cooper (1985). The present piece of work pursues the same goal under different conditions. In contrast to Milligan and Cooper's work, the emphasis here is on high-dimensional empirical binary data. Binary artificial data sets are constructed to reflect features typically encountered in real-world data situations in the field of marketing research. The simulation includes 162 binary data sets that are clustered by two different algorithms and lead to recommendations on the number of clusters for each index under consideration. Index results are evaluated and their performance is compared and analyzed.},
  keywords = {number of clusters; clustering indexes; binary data; artificial data sets; market segmentation; SIMILARITY; COEFFICIENTS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Vassiliou1989,
  author = {Vassiliou, A. and Ignatiades, L. 
and Karydis, M.},
  title = {Clustering of transect phytoplankton collections with a quick randomization algorithm},
  journal = {Journal of Experimental Marine Biology and Ecology},
  year = {1989},
  volume = {130},
  pages = {135--145},
  number = {2},
  times-cited = {9},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Datta2003,
  author = {Datta, S. and Datta, S.},
  title = {Comparisons and validation of statistical clustering techniques for microarray gene expression data},
  journal = {Bioinformatics},
  year = {2003},
  volume = {19},
  pages = {459--466},
  number = {4},
  abstract = {Motivation: With the advent of microarray chip technology, large data sets are emerging containing the simultaneous expression levels of thousands of genes at various time points during a biological process. Biologists are attempting to group genes based on the temporal pattern of their expression levels. While the use of hierarchical clustering (UPGMA) with correlation 'distance' has been the most common in the microarray studies, there are many more choices of clustering algorithms in pattern recognition and statistics literature. At the moment there do not seem to be any clear-cut guidelines regarding the choice of a clustering algorithm to be used for grouping genes based on their expression profiles. Results: In this paper, we consider six clustering algorithms (of various flavors!) and evaluate their performances on a well-known publicly available microarray data set on sporulation of budding yeast and on two simulated data sets. Among other things, we formulate three reasonable validation strategies that can be used with any clustering algorithm when temporal observations or replications are present. We evaluate each of these six clustering methods with these validation measures. While the 'best' method is dependent on the exact validation strategy and the number of clusters to be used, overall Diana appears to be a solid performer.
Interestingly, the performance of correlation-based hierarchical clustering and model-based clustering (another method that has been advocated by a number of researchers) appear to be on opposite extremes, depending on what validation measure one employs. Next it is shown that the group means produced by Diana are the closest and those produced by UPGMA are the farthest from a model profile based on a set of hand-picked genes.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Chaganty2006,
  author = {Chaganty, N. R. and Joe, H.},
  title = {Range of correlation matrices for dependent {Bernoulli} random variables},
  journal = {Biometrika},
  year = {2006},
  volume = {93},
  pages = {197--206},
  number = {1},
  abstract = {We say that a pair (p, R) is compatible if there exists a multivariate binary distribution with mean vector p and correlation matrix R. In this paper we study necessary and sufficient conditions for compatibility for structured and unstructured correlation matrices. We give examples of correlation matrices that are incompatible with any p. Using our results we show that the parametric binary models of Emrich \& Piedmonte (1991) and Qaqish (2003) allow a good range of correlations between the binary variables. We also obtain necessary and sufficient conditions for a matrix of odds ratios to be compatible with a given p. Our findings support the popular belief that the odds ratios are less constrained and more flexible than the correlations.},
  keywords = {Frechet bound; generalised estimating equation; multivariate binary; odds ratio; BINARY VARIABLES; DISTRIBUTIONS},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Cesario2007,
  author = {Cesario, E. and Manco, G.
and Ortale, R.},
  title = {Top-down parameter-free clustering of high-dimensional categorical data},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {2007},
  volume = {19},
  pages = {1607--1624},
  number = {12},
  times-cited = {0},
  abstract = {A parameter-free, fully-automatic approach to clustering high-dimensional categorical data is proposed. The technique is based on a two-phase iterative procedure, which attempts to improve the overall quality of the whole partition. In the first phase, cluster assignments are given, and a new cluster is added to the partition by identifying and splitting a low-quality cluster. In the second phase, the number of clusters is fixed, and an attempt to optimize cluster assignments is done. On the basis of such features, the algorithm attempts to improve the overall quality of the whole partition and finds clusters in the data, whose number is naturally established on the basis of the inherent features of the underlying data set rather than being previously specified. Furthermore, the approach is parametric to the notion of cluster quality: Here, a cluster is defined as a set of tuples exhibiting a sort of homogeneity. We show how a suitable notion of cluster homogeneity can be defined in the context of high-dimensional categorical data, from which an effective instance of the proposed clustering scheme immediately follows. Experiments on both synthetic and real data prove that the devised algorithm scales linearly and achieves nearly optimal results in terms of compactness and separation.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% No journal or year is recorded for this item, so it is typed MISC (which has
% no required fields) instead of ARTICLE. TODO: add year and venue once known.
@MISC{Cary,
  author = {Cary, A. J.},
  title = {Generating data with the {SAS} dataset},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Carreira-Perpinan2000,
  author = {Carreira-Perpinan, M. A.
and Renals, S.},
  title = {Practical identifiability of finite mixtures of multivariate {Bernoulli} distributions},
  journal = {Neural Computation},
  year = {2000},
  volume = {12},
  pages = {141--152},
  number = {1},
  times-cited = {9},
  abstract = {The class of finite mixtures of multivariate Bernoulli distributions is known to be nonidentifiable; that is, different values of the mixture parameters can correspond to exactly the same probability distribution. In principle, this would mean that sample estimates using this model would give rise to different interpretations. We give empirical support to the fact that estimation of this class of mixtures can still produce meaningful results in practice, thus lessening the importance of the identifiability problem. We also show that the expectation-maximization algorithm is guaranteed to converge to a proper maximum likelihood estimate, owing to a property of the log-likelihood surface. Experiments with synthetic data sets show that an original generating distribution can be estimated from a sample. Experiments with an electropalatography data set show important structure in the data.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

% The export recorded the end page as an ampersand placeholder; only the
% start page is kept. TODO: confirm the page extent against the journal.
@ARTICLE{Calinski1968,
  author = {Calinski, T.},
  title = {A dendrite method for cluster analysis},
  journal = {Biometrics},
  year = {1968},
  volume = {24},
  pages = {207},
  number = {1},
  times-cited = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Bryant1991,
  author = {Bryant, P. G.},
  title = {Large-sample results for optimization-based clustering methods},
  journal = {Journal of Classification},
  year = {1991},
  volume = {8},
  pages = {31--44},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.20}
}

@ARTICLE{Pickles1995,
  author = {Pickles, A. and Bolton, P. and Macdonald, H. and Bailey, A. and Le Couteur, A. and Sim, C. H.
and Rutter, M.},
  title = {Latent-class analysis of recurrence risks for complex phenotypes with selection and measurement error: a twin and family history study of autism},
  journal = {American Journal of Human Genetics},
  year = {1995},
  volume = {57},
  pages = {717--726},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{McLachlan1987,
  author = {McLachlan, G. J.},
  title = {On bootstrapping the likelihood ratio test statistic for the number of components in a normal mixture},
  journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
  year = {1987},
  volume = {36},
  pages = {318--324},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Loken2004,
  author = {Loken, E.},
  title = {Using latent class analysis to model temperament types},
  journal = {Multivariate Behavioral Research},
  year = {2004},
  volume = {39},
  pages = {625--652},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Lanza2007,
  author = {Lanza, S. T. and Collins, L. M. and Lemmon, D. R. and Schafer, J. L.},
  title = {{PROC LCA}: A {SAS} procedure for latent class analysis},
  journal = {Structural Equation Modeling: A Multidisciplinary Journal},
  year = {2007},
  volume = {14},
  pages = {671--694},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Holt1989,
  author = {Holt, J. A. and Macready, G. B.},
  title = {A simulation study of the difference Chi-square statistic for comparing latent class models under violation of regularity conditions},
  journal = {Applied Psychological Measurement},
  year = {1989},
  volume = {13},
  pages = {221--231},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Feng1996,
  author = {Feng, Z. D. and McCulloch, C.
E.},
  title = {Using bootstrap likelihood ratios in finite mixture models},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  year = {1996},
  volume = {58},
  pages = {609--617},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Everitt1988,
  author = {Everitt, B. S.},
  title = {A {Monte-Carlo} investigation of the likelihood ratio test for number of classes in latent class analysis},
  journal = {Multivariate Behavioral Research},
  year = {1988},
  volume = {23},
  pages = {531--538},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

% Author names restored from the run-together export form; key unchanged.
@ARTICLE{Desoete1991,
  author = {De Soete, G. and DeSarbo, W. S.},
  title = {A latent class probit model for analyzing pick any/n data},
  journal = {Journal of Classification},
  year = {1991},
  volume = {8},
  pages = {45--63},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Banfield1993,
  author = {Banfield, J. D. and Raftery, A. E.},
  title = {Model-based {Gaussian} and non-{Gaussian} clustering},
  journal = {Biometrics},
  year = {1993},
  volume = {49},
  pages = {803--821},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

@ARTICLE{Aitkin1981,
  author = {Aitkin, M. and Anderson, D. and Hinde, J.},
  title = {Statistical modeling of data on teaching styles},
  journal = {Journal of the Royal Statistical Society. Series A (General)},
  year = {1981},
  volume = {144},
  pages = {419--461},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.21}
}

% TODO: the required year field is missing for this BOOK entry.
@BOOK{Krzanowski,
  title = {Multivariate Analysis Part 2: Classification, covariance structures and repeated measurements},
  publisher = {Halsted Press},
  author = {Krzanowski, W. J. and Marriott, F. H. C.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.22}
}

@ARTICLE{Garrett2000,
  author = {Garrett, E. S. and Zeger, S.
L.},
  title = {Latent class model diagnosis},
  journal = {Biometrics},
  year = {2000},
  volume = {56},
  pages = {1055--1067},
  number = {4},
  abstract = {In many areas of medical research, such as psychiatry and gerontology, latent class variables are used to classify individuals into disease categories, often with the intention of hierarchical modeling. Problems arise when it is not clear how many disease classes are appropriate, creating a need for model selection and diagnostic techniques. Previous work has shown that the Pearson chi (2) statistic and the log-likelihood ratio G(2) statistic are not valid test statistics for evaluating latent class models. Other methods, such as information criteria, provide decision rules without providing explicit information about where discrepancies occur between a model and the data. Identifiability issues further complicate these problems. This paper develops procedures for assessing Markov chain Monte Carlo convergence and model diagnosis and for selecting the number of categories for the latent variable based on evidence in the data using Markov chain Monte Carlo techniques. Simulations and a psychiatric example are presented to demonstrate the effective use of these methods.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Garrett2002,
  author = {Garrett, E. S. and Eaton, W. W. and Zeger, S.},
  title = {Methods for evaluating the performance of diagnostic tests in the absence of a gold standard: a latent class model approach},
  journal = {Statistics in Medicine},
  year = {2002},
  volume = {21},
  pages = {1289--1307},
  number = {9},
  abstract = {In many areas of medical research, `gold standard' diagnostic tests do not exist and so evaluating the performance of standardized diagnostic criteria or algorithms is problematic. In this paper we propose an approach to evaluating the operating characteristics of diagnoses using a latent class model.
By defining `true disease' as our latent variable, we are able to estimate sensitivity, specificity and negative and positive predictive values of the diagnostic test. These methods are applied to diagnostic criteria for depression using Baltimore's Epidemiologic Catchment Area Study Wave 3 data. Copyright (C) 2002 John Wiley \& Sons, Ltd.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Zeger2003,
  author = {Zeger, S. L. and Garrett, E. S.},
  title = {Rejoinder to ``Latent class model diagnosis from a frequentist point of view''},
  journal = {Biometrics},
  year = {2003},
  volume = {59},
  pages = {197--198},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Uebersax1999,
  author = {Uebersax, J. S.},
  title = {Probit latent class analysis with dichotomous or ordered category measures: Conditional independence/dependence models},
  journal = {Applied Psychological Measurement},
  year = {1999},
  volume = {23},
  pages = {283--297},
  number = {4},
  abstract = {Flexible methods that relax restrictive conditional independence assumptions of latent class analysis (LCA) are described. Dichotomous and ordered category manifest variables are viewed as discretized latent continuous variables. The latent continuous variables are assumed to have a mixture-of-multivariate-normals distribution. Within a latent class, conditional dependence is modeled as the mutual association of all or some latent continuous variables with a continuous latent trait (or in special cases, multiple latent traits). The relaxation of conditional independence assumptions allows LCA to better model natural taxa. Comparisons of specific restricted and unrestricted models permit statistical tests of specific aspects of latent taxonic structure. Latent class, latent trait, and latent distribution analysis can be viewed as special cases of the mixed latent trait model.
The relationship between the multivariate probit mixture model proposed here and Rost's mixed Rasch (1990, 1991) model is discussed. Two studies illustrate different uses of the proposed model.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Reboussin2008,
  author = {Reboussin, B. A. and Ip, E. H. and Wolfson, M.},
  title = {Locally dependent latent class models with covariates: an application to under-age drinking in the {USA}},
  journal = {Journal of the Royal Statistical Society. Series A (Statistics in Society)},
  year = {2008},
  volume = {171},
  pages = {877--897},
  number = {4},
  abstract = {Under-age drinking is a long-standing public health problem in the USA and the identification of underage drinkers suffering alcohol-related problems has been difficult by using diagnostic criteria that were developed in adult populations. For this reason, it is important to characterize patterns of drinking in adolescents that are associated with alcohol-related problems. Latent class analysis is a statistical technique for explaining heterogeneity in individual response patterns in terms of a smaller number of classes. However, the latent class analysis assumption of local independence may not be appropriate when examining behavioural profiles and could have implications for statistical inference. In addition, if covariates are included in the model, non-differential measurement is also assumed. We propose a flexible set of models for local dependence and differential measurement that use easily interpretable odds ratio parameterizations while simultaneously fitting a marginal regression model for the latent class prevalences. Estimation is based on solving a set of second-order estimating equations. This approach requires only specification of the first two moments and allows for the choice of simple 'working' covariance structures. The method is illustrated by using data from a large-scale survey of under-age drinking.
This new approach indicates the effectiveness of introducing local dependence and differential measurement into latent class models for selecting substantively interpretable models over more complex models that are deemed empirically superior.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Menten2008,
  author = {Menten, J. and Boelaert, M. and Lesaffre, E.},
  title = {{Bayesian} latent class models with conditionally dependent diagnostic tests: A case study},
  journal = {Statistics in Medicine},
  year = {2008},
  volume = {27},
  pages = {4469--4488},
  number = {22},
  abstract = {In the assessment of the accuracy of diagnostic tests for infectious diseases, the true disease status of the subjects is often unknown due to the lack of a gold standard test. Latent class models with two latent classes, representing diseased and non-diseased subjects, are often used to analyze this type of data. In its basic format, latent class analysis requires the observed outcomes to be statistically independent conditional on the disease status. In most diagnostic settings, this assumption is highly questionable. During the last decade, several methods have been proposed to estimate latent class models with conditional dependence between the test results. A class of flexible fixed and random effects models were described by Dendukuri and Joseph in a Bayesian framework. We illustrate these models using the analysis of a diagnostic study of three field tests and an imperfect reference test for the diagnosis of visceral leishmaniasis. We show that, as observed earlier by Albert and Dodd, different dependence models may result in similar fits to the data while resulting in different inferences. Given this problem, selection of appropriate latent class models should be based on substantive subject matter knowledge.
If several clinically plausible models are supported by the data, a sensitivity analysis should be performed by describing the results obtained from different models and using different priors. Copyright (c) 2008 John Wiley \& Sons, Ltd.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@ARTICLE{Branscum2004,
  author = {Branscum, A. J. and Gardner, I. A. and Johnson, W. O.},
  title = {{Bayesian} modeling of animal- and herd-level prevalences},
  journal = {Preventive Veterinary Medicine},
  year = {2004},
  volume = {66},
  pages = {101--112},
  number = {1--4},
  abstract = {We reviewed Bayesian approaches for animal-level and herd-level prevalence estimation based on cross-sectional sampling designs and demonstrated fitting of these models using the WinBUGS software. We considered estimation of infection prevalence based on use of a single diagnostic test applied to a single herd with binomial and hypergeometric sampling. We then considered multiple herds under binomial sampling with the primary goal of estimating the prevalence distribution and the proportion of infected herds. A new model is presented that can be used to estimate the herd-level prevalence in a region, including the posterior probability that all herds are non-infected. Using this model, inferences for the distribution of prevalences, mean prevalence in the region, and predicted prevalence of herds in the region (including the predicted probability of zero prevalence) are also available. In the models presented, both animal- and herd-level prevalences are modeled as mixture distributions to allow for zero infection prevalences. (If mixture models for the prevalences were not used, prevalence estimates might be artificially inflated, especially in herds and regions with low or zero prevalence.) Finally, we considered estimation of animal-level prevalence based on pooled samples. (C) 2004 Elsevier B.V.
All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.23}
}

@BOOK{Everitt1981,
  title = {Finite mixture distributions},
  publisher = {Chapman and Hall},
  year = {1981},
  author = {Everitt, Brian and Hand, D. J.},
  series = {Monographs on applied probability and statistics},
  address = {London ; New York},
  keywords = {Mixture distributions (Probability theory)},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Everitt1993,
  title = {Cluster analysis},
  publisher = {E. Arnold ; Halsted Press},
  year = {1993},
  author = {Everitt, Brian},
  address = {London ; New York},
  edition = {Third},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Everitt1984,
  title = {An introduction to latent variable models},
  publisher = {Chapman and Hall},
  year = {1984},
  author = {Everitt, Brian},
  series = {Monographs on statistics and applied probability},
  address = {London ; New York},
  keywords = {Latent variables.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Krzanowski1994,
  title = {Multivariate analysis},
  publisher = {E. Arnold ; Halsted Press},
  year = {1994},
  author = {Krzanowski, W. J. and Marriott, F. H. C.},
  series = {Kendall's library of statistics},
  address = {London ; New York},
  keywords = {Multivariate analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Everitt1974,
  title = {Cluster analysis},
  publisher = {Heinemann Educational [for] the Social Science Research Council},
  year = {1974},
  author = {Everitt, Brian},
  address = {London},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Dunn1993,
  title = {Modelling covariances and latent variables using {EQS}},
  publisher = {Chapman \& Hall},
  year = {1993},
  author = {Dunn, G. and Everitt, Brian and Pickles, Andrew},
  address = {London ; New York},
  edition = {First},
  annote = {93244250 G. Dunn, B. Everitt, and A. Pickles. ill. ; 24 cm. Reference card inserted. Includes bibliographical references (p.
[193]-194) and index.},
  keywords = {Factor analysis Data processing. Social sciences Statistical methods Data processing.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Everitt1980,
  title = {Cluster analysis},
  publisher = {published on behalf of the Social Science Research Council by Heinemann Educational Books ; Halsted Press},
  year = {1980},
  author = {Everitt, Brian},
  address = {London ; New York},
  edition = {Second},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@BOOK{Everitt2001,
  title = {Cluster analysis},
  publisher = {Arnold ; Oxford University Press},
  year = {2001},
  author = {Everitt, Brian and Landau, Sabine and Leese, Morven},
  address = {London ; New York},
  edition = {Fourth},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.24}
}

@ARTICLE{Escobar1995,
  author = {Escobar, M. D. and West, M.},
  title = {{Bayesian} density estimation and inference using mixtures},
  journal = {Journal of the American Statistical Association},
  year = {1995},
  volume = {90},
  pages = {577--588},
  number = {430},
  abstract = {We describe and illustrate Bayesian inference in models for density estimation using mixtures of Dirichlet processes. These models provide natural settings for density estimation and are exemplified by special cases where data are modeled as a sample from mixtures of normal distributions. Efficient simulation methods are used to approximate various prior, posterior, and predictive distributions. This allows for direct inference on a variety of practical issues, including problems of local versus global smoothing, uncertainty about density estimates, assessment of modality, and the inference on the numbers of components. Also, convergence results are established for a general class of normal mixture models.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Eaton1989,
  author = {Eaton, W. W. and Dryman, A. and Sorenson, A.
and McCutcheon, A.},
  title = {{DSM-III} major depressive disorder in the community: a latent class analysis of data from the {NIMH} {Epidemiologic Catchment Area Program}},
  journal = {British Journal of Psychiatry},
  year = {1989},
  volume = {155},
  pages = {48--54},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Dayton1988,
  author = {Dayton, C. M. and Macready, G. B.},
  title = {Concomitant-variable latent-class models},
  journal = {Journal of the American Statistical Association},
  year = {1988},
  volume = {83},
  pages = {173--178},
  number = {401},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@INPROCEEDINGS{Collins1992,
  author = {Collins, L. M. and Fidler, P. L. and Wugalter, S. E. and Long, J. D.},
  title = {Goodness-of-fit testing for latent class models},
  booktitle = {1992 Annual Meeting of the Society of Multivariate Experimental Psychology},
  year = {1992},
  pages = {375--389},
  address = {Chatham, MA},
  abstract = {Latent class models with sparse contingency tables can present problems for model comparison and selection, because under these conditions the distributions of goodness-of-fit indices are often unknown. This causes inaccuracies both in hypothesis testing and in model comparisons based on normed indices. In order to assess the extent of this problem, we carried out a simulation investigating the distributions of the likelihood ratio statistic G2, the Pearson statistic X2, and a new goodness-of-fit index suggested by Read and Cressie (1988). There were substantial deviations between the expectation of the chi-squared distribution and the means of the G2 and Read and Cressie distributions. In general, the mean of the distribution of a statistic was closer to the expectation of the chi-squared distribution when the average cell expectation was large, there were fewer indicator items, and the latent class measurement parameters were less extreme.
It was found that the mean of the X2 distribution is generally closer to the expectation of the chi-squared distribution than are the means of the other two indices we examined, but the standard deviation of the X2 distribution is considerably larger than that of the other two indices and larger than the standard deviation of the chi-squared distribution. We argue that a possible solution is to forgo reliance on theoretical distributions for expectations and quantiles of goodness-of-fit statistics. Instead, Monte Carlo sampling (Noreen, 1989) can be used to arrive at an empirical central or noncentral distribution.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Chib1995,
  author = {Chib, S. and Greenberg, E.},
  title = {Understanding the {Metropolis-Hastings} algorithms},
  journal = {American Statistician},
  year = {1995},
  volume = {49},
  pages = {327--335},
  number = {4},
  abstract = {We provide a detailed, introductory exposition of the Metropolis-Hastings algorithm, a powerful Markov chain method to simulate multivariate distributions. A simple, intuitive derivation of this method is given along with guidance on implementation. Also discussed are two applications of the algorithm, one for implementing acceptance-rejection sampling when a blanketing function is not available and the other for implementing the algorithm with block-at-a-time scans. In the latter situation, many different algorithms, including the Gibbs sampler, are shown to be special cases of the Metropolis-Hastings algorithm. The methods are illustrated with examples.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Casella1992,
  author = {Casella, G. and George, E. I.},
  title = {Explaining the {Gibbs} sampler},
  journal = {American Statistician},
  year = {1992},
  volume = {46},
  pages = {167--174},
  number = {3},
  abstract = {Computer-intensive algorithms, such as the Gibbs sampler, have become increasingly popular statistical tools, both in applied and theoretical work.
The properties of such algorithms, however, may sometimes not be obvious. Here we give a simple explanation of how and why the Gibbs sampler works. We analytically establish its properties in a simple case and provide insight for more complicated cases. There are also a number of examples.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Sullivan1998,
  author = {Sullivan, P. F. and Kessler, R. C. and Kendler, K. S.},
  title = {Latent class analysis of lifetime depressive symptoms in the {National Comorbidity Survey}},
  journal = {American Journal of Psychiatry},
  year = {1998},
  volume = {155},
  pages = {1398--1406},
  number = {10},
  abstract = {Objective: Although clinical trials have documented the importance of identifying individuals with major depression with atypical features, there are fewer epidemiological data. In a prior report, the authors used latent class analysis (LCA) to identify a distinctive atypical depressive subtype; they sought to replicate that finding in the current study. Method: Using the National Comorbidity Survey data, the authors applied LCA to 14 DSM-III-R major depressive symptoms in the participants' lifetime worst episodes (N = 2,836). Validators of class membership included depressive disorder characteristics, syndrome consequences, demography, comorbidity, personality/attitudes, and parental psychiatric history. Results: The best-fitting LCA solution had six classes. Four were combinations of atypicality and severity: severe atypical, mild atypical, severe typical, and mild typical. Syndrome severity (severe atypical and typical versus mild atypical and typical classes) was associated with a pronounced pattern of more and longer episodes, worse syndrome consequences, increased psychiatric comorbidity, more deviant personality and attitudes, and parental alcohol/drug use disorder.
Syndrome atypicality (severe and mild atypical versus severe and mild typical classes) was associated with decreased syndrome consequences, comorbid conduct disorder and social phobia, higher interpersonal dependency and lower self-esteem, and parental alcohol/drug use disorder. Conclusions: As in prior reports, the atypical subtype of depression can be identified in epidemiological samples and, like typical depression, exists in mild and severe variants. Atypical depressive subtypes were characterized by several distinctive features. However, the correspondence between epidemiologically derived typologies of atypical depression and DSM-IV major depression with atypical features is not yet known.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Richardson1997,
  author = {Richardson, S. and Green, P. J.},
  title = {On {Bayesian} analysis of mixtures with an unknown number of components},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  year = {1997},
  volume = {59},
  pages = {731--758},
  number = {4},
  abstract = {New methodology for fully Bayesian mixture analysis is developed, making use of reversible jump Markov chain Monte Carlo methods that are capable of jumping between the parameter subspaces corresponding to different numbers of components in the mixture. A sample from the full joint distribution of all unknown variables is thereby generated, and this can be used as a basis for a thorough presentation of many aspects of the posterior distribution. The methodology is applied here to the analysis of univariate normal mixtures, using a hierarchical prior model that offers an approach to dealing with weak prior information while avoiding the mathematical pitfalls of using improper priors in the mixture context.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Fraley1999,
  author = {Fraley, C. and Raftery, A.
E.},
  title = {{MCLUST}: Software for model-based cluster analysis},
  journal = {Journal of Classification},
  year = {1999},
  volume = {16},
  pages = {297--306},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{DeBacker1999,
  author = {De Backer, S. and Scheunders, P.},
  title = {A competitive elliptical clustering algorithm},
  journal = {Pattern Recognition Letters},
  year = {1999},
  volume = {20},
  pages = {1141--1147},
  number = {11--13},
  abstract = {This paper introduces a new learning algorithm for on-line ellipsoidal clustering. The algorithm is based on the competitive clustering scheme extended by two specific features. Elliptical clustering is accomplished by efficiently incorporating the Mahalanobis distance measure into the learning rules, and underutilization of smaller clusters is avoided by incorporating a frequency-sensitive term. Experiments are conducted to demonstrate the usefulness of the algorithm on artificial data-sets as well as on the problem of texture segmentation. (C) 1999 Elsevier Science B.V. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Fraley2002,
  author = {Fraley, C. and Raftery, A. E.},
  title = {Model-based clustering, discriminant analysis, and density estimation},
  journal = {Journal of the American Statistical Association},
  year = {2002},
  volume = {97},
  pages = {611--631},
  number = {458},
  abstract = {Cluster analysis is the automated search for groups of related observations in a dataset. Most clustering done in practice is based largely on heuristic but intuitively reasonable procedures, and most clustering methods available in commercial software are also of this type. However, there is little systematic guidance associated with these methods for solving important practical questions that arise in cluster analysis, such as how many clusters are there, which clustering method should be used, and how should outliers be handled.
We review a general methodology for model-based clustering that provides a principled statistical approach to these issues. We also show that this can be useful for other problems in multivariate analysis, such as discriminant analysis and multivariate density estimation. We give examples from medical diagnosis, minefield detection, cluster recovery from noisy data, and spatial density estimation. Finally, we mention limitations of the methodology and discuss recent developments in model-based clustering for non-Gaussian data, high-dimensional datasets, large datasets, and Bayesian estimation.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% Fraley2006: the report number lives in `number` (styles print the
% "Technical Report" label themselves) instead of being typed into the title.
% NOTE(review): first author changed from "Fraley, C. F." to "Fraley, C." to
% agree with Fraley1999 and Fraley2002 in this file; confirm against the report.
@TECHREPORT{Fraley2006,
  author = {Fraley, C. and Raftery, A. E.},
  title = {{MCLUST} Version 3 for {R}: Normal mixture modeling and model-based clustering},
  institution = {Department of Statistics, University of Washington},
  year = {2006},
  number = {504},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Dempster1977,
  author = {Dempster, A. P. and Laird, N. M. and Rubin, D. B.},
  title = {Maximum Likelihood from Incomplete Data via the {EM} Algorithm},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  year = {1977},
  volume = {39},
  pages = {1--38},
  number = {1},
  abstract = {A broadly applicable algorithm for computing maximum likelihood estimates from incomplete data is presented at various levels of generality. Theory showing the monotone behaviour of the likelihood and convergence of the algorithm is derived.
Many examples are sketched, including missing value situations, applications to grouped, censored or truncated data, finite mixture models, variance component estimation, hyperparameter estimation, iteratively reweighted least squares and factor analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% Ward1963: with two commas BibTeX reads a name as "Last, Jr, First", so the
% suffix has to be the middle part; the previous order made "Joe H." the suffix.
@ARTICLE{Ward1963,
  author = {Ward, Jr., Joe H.},
  title = {Hierarchical Grouping to Optimize an Objective Function},
  journal = {Journal of the American Statistical Association},
  year = {1963},
  volume = {58},
  pages = {236--244},
  number = {301},
  abstract = {A procedure for forming hierarchical groups of mutually exclusive subsets, each of which has members that are maximally similar with respect to specified characteristics, is suggested for use in large-scale ($n > 100$) studies when a precise optimal solution for a specified number of groups is not practical. Given n sets, this procedure permits their reduction to n - 1 mutually exclusive sets by considering the union of all possible n(n - 1)/2 pairs and selecting a union having a maximal value for the functional relation, or objective function, that reflects the criterion chosen by the investigator. By repeating this process until only one group remains, the complete hierarchical structure and a quantitative estimate of the loss associated with each stage in the grouping can be obtained. A general flowchart helpful in computer programming and a numerical example are included.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% NOTE(review): "2" in the exported title restored to the word "two"; confirm
% against the published title.
@ARTICLE{Fowlkes1983,
  author = {Fowlkes, E. B. and Mallows, C. L.},
  title = {A method for comparing two hierarchical clusterings},
  journal = {Journal of the American Statistical Association},
  year = {1983},
  volume = {78},
  pages = {553--569},
  number = {383},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Morey1984,
  author = {Morey, L. C.
and Agresti, A.},
  title = {The measurement of classification agreement - an adjustment to the {Rand} statistic for chance agreement},
  journal = {Educational and Psychological Measurement},
  year = {1984},
  volume = {44},
  pages = {33--37},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% NOTE(review): "para-median" in the exported title looks like a database
% expansion of "p-median"; restored here, confirm against the journal.
@ARTICLE{Klastorin1985,
  author = {Klastorin, T. D.},
  title = {The p-median problem for cluster-analysis - a comparative test using the mixture model approach},
  journal = {Management Science},
  year = {1985},
  volume = {31},
  pages = {84--95},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Hubert1985,
  author = {Hubert, L. and Arabie, P.},
  title = {Comparing partitions},
  journal = {Journal of Classification},
  year = {1985},
  volume = {2},
  pages = {193--218},
  number = {2--3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Fisher1958,
  author = {Fisher, W. D.},
  title = {On grouping for maximum homogeneity},
  journal = {Journal of the American Statistical Association},
  year = {1958},
  volume = {53},
  pages = {789--798},
  number = {284},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Ball1967,
  author = {Ball, G. H. and Hall, D.
J.},
  title = {A clustering technique for summarizing multivariate data},
  journal = {Behavioral Science},
  year = {1967},
  volume = {12},
  pages = {153--155},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% The entries on this stretch were exported with "NNN-\&" as the page field,
% i.e. a start page followed by a placeholder for an unknown end page.
% NOTE(review): the end pages below (Ball1967 155, Mantel1967 220, Rubin1967 144,
% Rand1971 850) were filled in from the published records; verify before relying
% on them.
% NOTE(review): Mantel1967 issue "2P1" expanded to "2, Part 1".
@ARTICLE{Mantel1967,
  author = {Mantel, N.},
  title = {Detection of disease clustering and a generalized regression approach},
  journal = {Cancer Research},
  year = {1967},
  volume = {27},
  pages = {209--220},
  number = {2, Part 1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Rubin1967,
  author = {Rubin, J.},
  title = {Optimal classification into groups - an approach for solving taxonomy problem},
  journal = {Journal of Theoretical Biology},
  year = {1967},
  volume = {15},
  pages = {103--144},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

@ARTICLE{Rand1971,
  author = {Rand, W. M.},
  title = {Objective criteria for evaluation of clustering methods},
  journal = {Journal of the American Statistical Association},
  year = {1971},
  volume = {66},
  pages = {846--850},
  number = {336},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% Brennan1974: "NOV" was a month stored in the issue field; it now lives in
% `month`. NOTE(review): issue number 2 and the word "two" in the title were
% restored from the published record; confirm.
@ARTICLE{Brennan1974,
  author = {Brennan, R. L. and Light, R. J.},
  title = {Measuring agreement when two observers classify people into categories},
  journal = {British Journal of Mathematical \& Statistical Psychology},
  year = {1974},
  volume = {27},
  pages = {154--163},
  number = {2},
  month = nov,
  owner = {rebeccaspeckman},
  timestamp = {2009.01.25}
}

% Chandra1977: the all-caps export title is stored in normal case.
% NOTE(review): the leading "On the" was restored from the published title;
% confirm.
@ARTICLE{Chandra1977,
  author = {Chandra, S.},
  title = {On the mixtures of probability distributions},
  journal = {Scandinavian Journal of Statistics},
  year = {1977},
  volume = {4},
  pages = {105--112},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Kasahara2009,
  author = {Kasahara, H.
and Shimotsu, K.},
  title = {Nonparametric Identification of Finite Mixture Models of Dynamic Discrete Choices},
  journal = {Econometrica},
  year = {2009},
  volume = {77},
  pages = {135--175},
  number = {1},
  abstract = {In dynamic discrete choice analysis, controlling for unobserved heterogeneity is an important issue, and finite mixture models provide flexible ways to account for it. This paper studies nonparametric identifiability of type probabilities and type-specific component distributions in finite mixture models of dynamic discrete choices. We derive sufficient conditions for nonparametric identification for various finite mixture models of dynamic discrete choices used in applied work under different assumptions on the Markov property, stationarity, and type-invariance in the transition process. Three elements emerge as the important determinants of identification: the time-dimension of panel data, the number of values the covariates can take, and the heterogeneity of the response of different types to changes in the covariates. For example, in a simple case where the transition function is type-invariant, a time-dimension of $T = 3$ is sufficient for identification, provided that the number of values the covariates can take is no smaller than the number of types and that the changes in the covariates induce sufficiently heterogeneous variations in the choice probabilities across types. Identification is achieved even when state dependence is present if a model is stationary first-order Markovian and the panel has a moderate time-dimension ($T \geq 6$).},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% NOTE(review): the page field was exported as "209-\&" (unknown end page);
% end page 214 filled in from the published record, verify.
@ARTICLE{Yakowitz1968,
  author = {Yakowitz, S. J. and Spragins, J. D.},
  title = {On identifiability of finite mixtures},
  journal = {Annals of Mathematical Statistics},
  year = {1968},
  volume = {39},
  pages = {209--214},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Wolfe1970,
  author = {Wolfe, J.
H.},
  title = {Pattern clustering by multivariate mixture analysis},
  journal = {Multivariate Behavioral Research},
  year = {1970},
  volume = {5},
  pages = {329--350},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% NOTE(review): the page field was exported as "1265-\&" (unknown end page);
% end page 1269 filled in from the published record, verify.
@ARTICLE{Teicher1963,
  author = {Teicher, H.},
  title = {Identifiability of finite mixtures},
  journal = {Annals of Mathematical Statistics},
  year = {1963},
  volume = {34},
  pages = {1265--1269},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Teicher1961,
  author = {Teicher, H.},
  title = {Identifiability of mixtures},
  journal = {Annals of Mathematical Statistics},
  year = {1961},
  volume = {32},
  pages = {244--248},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Stefankovic2007,
  author = {Stefankovic, D. and Vigoda, E.},
  title = {Phylogeny of mixture models: Robustness of maximum likelihood and non-identifiable distributions},
  journal = {Journal of Computational Biology},
  year = {2007},
  volume = {14},
  pages = {156--189},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Morris2007,
  author = {Morris, J. N.},
  title = {Uses of epidemiology (vol 2, pg 395-401, 1955)},
  journal = {International Journal of Epidemiology},
  year = {2007},
  volume = {36},
  pages = {1165--1172},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% Morris1975: classic BibTeX styles expect the edition as an ordinal word; the
% keyword that was listed twice is kept once.
@BOOK{Morris1975,
  title = {Uses of epidemiology},
  publisher = {Churchill Livingstone : distributed in the U.S.A. by Longman},
  year = {1975},
  author = {Morris, Jeremy Noah},
  address = {Edinburgh ; New York},
  edition = {Third},
  keywords = {Epidemiology.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% Morris1955: "AUG13" was a cover date stored in the issue field; the month
% now lives in `month`. NOTE(review): issue number 4936 restored from the
% published record, verify.
@ARTICLE{Morris1955,
  author = {Morris, J. N.},
  title = {Uses of epidemiology},
  journal = {British Medical Journal},
  year = {1955},
  volume = {2},
  pages = {395--401},
  number = {4936},
  month = aug,
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ELECTRONIC{CramerRule,
  author = {Weisstein, E.
W.},
  title = {{Cramer's} Rule.},
  organization = {MathWorld--A Wolfram Web Resource},
  url = {http://mathworld.wolfram.com/CramersRule.html},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ELECTRONIC{singular,
  author = {Weisstein, E. W.},
  title = {Singular Matrix.},
  organization = {MathWorld--A Wolfram Web Resource},
  url = {http://mathworld.wolfram.com/SingularMatrix.html},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% Park2004: the editor list had no "and" separators and no comma after three
% of the surnames, so BibTeX parsed all four people as one name.
@INPROCEEDINGS{Park2004,
  author = {Park, J. W. and Logue, M. and Ni, J. and Cremer, J. and Segre, A. and Vieland, V.},
  title = {Scientific visualization of multidimensional data: Genetic likelihood visualization},
  booktitle = {International Conference on High Performance Computing and Its Applications},
  year = {2004},
  editor = {Zhang, W. and Chen, Z. and Glowinski, R. and Tong, W.},
  pages = {403--408},
  address = {Shanghai, People's Republic of China},
  abstract = {Although many computer graphic technologies have been developed for visualizing multidimensional multivariate data, the scientific visualization used by research scientists to interpret genetics data is very promising technique. In this paper, we present our research in a scientific visualization on linkage analysis data to enhance the performance or the efficiency of genetic likelihood research.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@BOOK{Eliason1993,
  title = {Maximum likelihood estimation : logic and practice},
  publisher = {Sage},
  year = {1993},
  author = {Eliason, Scott R.},
  series = {Sage university papers series. Quantitative applications in the social sciences},
  address = {Newbury Park, Calif.},
  keywords = {Social sciences Statistical methods. Estimation theory.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@ARTICLE{Groeneveld1990,
  author = {Groeneveld, E.
and Kovac, M.},
  title = {A note on multiple solutions in multivariate restricted maximum-likelihood covariance component estimation},
  journal = {Journal of Dairy Science},
  year = {1990},
  volume = {73},
  pages = {2221--2229},
  number = {8},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

% Atwood1992: the acronym in the title is brace-protected against
% sentence-casing styles; stray spaces inside "( GAW8 )" removed.
@INPROCEEDINGS{Atwood1992,
  author = {Atwood, L. D. and Kammerer, C. M. and Mitchell, B. D.},
  title = {Exploring the {HDL} likelihood surface},
  booktitle = {Genetic Analysis Workshop 8: Issues in the Analysis of Complex Diseases and Their Risk Factors (GAW8)},
  year = {1992},
  pages = {641--645},
  address = {Watsonville, Ca},
  abstract = {Using random initial parameter estimates, three segregation analysis models of the inheritance of HDL2 in the Berkeley GAW8 data set were maximized 5000 times each. Initial parameter estimates were assumed to be uniformly distributed on intervals formed by parameter boundaries. The three models were unrestricted, environmental, and Mendelian regressive type A models. Likelihood ratio tests of the global maxima rejected the Mendelian model and accepted the environmental model. However, tests using local maxima accepted the Mendelian model and both rejected and accepted the environmental model. Patterns among the initial parameter estimates of convergent runs were examined to develop empirical rules to increase the frequency of convergence. These rules were tested using data on apoAI in the Berkeley GAW8 data set. (C) 1993 Wiley-Liss, Inc.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.26}
}

@BOOK{McCutcheon1987,
  title = {Latent class analysis},
  publisher = {Sage Publications},
  year = {1987},
  author = {McCutcheon, Allan L.},
  series = {Sage University papers series. Quantitative applications in the social sciences},
  address = {Newbury Park},
  keywords = {Latent structure analysis.
Latent variables.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.27}
}

@BOOK{Hagenaars2002,
  title = {Applied latent class analysis},
  publisher = {Cambridge University Press},
  year = {2002},
  author = {Hagenaars, Jacques A. and McCutcheon, Allan L.},
  address = {Cambridge ; New York},
  keywords = {Latent structure analysis. Latent variables.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.27}
}

% Gordon1999: classic BibTeX styles expect the edition as an ordinal word.
@BOOK{Gordon1999,
  title = {Classification},
  publisher = {Chapman \& Hall/CRC},
  year = {1999},
  author = {Gordon, A. D.},
  series = {Monographs on statistics and applied probability},
  address = {Boca Raton},
  edition = {Second},
  keywords = {Discriminant analysis. Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.28}
}

@BOOK{Gordon1981,
  title = {Classification : methods for the exploratory analysis of multivariate data},
  publisher = {Chapman and Hall},
  year = {1981},
  author = {Gordon, A. D.},
  series = {Monographs on applied probability and statistics},
  address = {London ; New York},
  keywords = {Discriminant analysis. Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.28}
}

% Bock1996: a titled contribution to an edited volume. It was stored as INBOOK
% with the contribution title in `chapter` (which styles print as a chapter
% number) and with a comma-separated editor list that BibTeX cannot split.
% Now INCOLLECTION with title/booktitle and "and"-separated editors; the
% citation key is unchanged.
@INCOLLECTION{Bock1996,
  author = {Bock, H. H.},
  title = {Probability models and hypothesis testing in partitioning cluster analysis},
  booktitle = {Clustering and Classification},
  publisher = {World Scientific Publ.},
  year = {1996},
  editor = {Arabie, P. and Hubert, L. J. and De Soete, G.},
  pages = {377--453},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.28}
}

@ARTICLE{Espeland1989,
  author = {Espeland, M. A. and Handelman, S.
L.},
  title = {Using latent class models to characterize and assess relative error in discrete measurements},
  journal = {Biometrics},
  year = {1989},
  volume = {45},
  pages = {587--599},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Ganesalingam1989: journal name written in the same pattern this file uses
% for Series B in Dempster1977.
@ARTICLE{Ganesalingam1989,
  author = {Ganesalingam, S.},
  title = {Classification and mixture approaches to clustering via maximum-likelihood},
  journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
  year = {1989},
  volume = {38},
  pages = {455--466},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Trauwaert1991,
  author = {Trauwaert, E. and Kaufman, L. and Rousseeuw, P.},
  title = {Fuzzy clustering algorithms based on the maximum-likelihood principle},
  journal = {Fuzzy Sets and Systems},
  year = {1991},
  volume = {42},
  pages = {213--227},
  number = {2},
  abstract = {A number of hard clustering algorithms have been shown to be derivable from the maximum likelihood principle. The only corresponding fuzzy algorithm are the well known fuzzy k-means or FUZZY ISODATA of Dunn and its generalizations by Bezdek and by Gustafson and Kessel. The authors show how to generate two other fuzzy algorithms which are the analogous of known hard algorithms: the minimization of the fuzzy determinant and of the product of fuzzy determinants. By comparison between the hard and fuzzy methods it appears that the latter yield more often the global optimum, rather than merely a local optimum. This result and the comparison between the different algorithms, together with their specific domains of application, are illustrated by a few numerical examples.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Celeux1991,
  author = {Celeux, G.
and Govaert, G.},
  title = {Clustering criteria for discrete-data and latent class models},
  journal = {Journal of Classification},
  year = {1991},
  volume = {8},
  pages = {157--176},
  number = {2},
  abstract = {We show that a well-known clustering criterion for discrete data, the information criterion, is closely related to the classification maximum likelihood criterion for the latent class model. This relation can be derived from the Bryant-Windham construction. Emphasis is placed on binary clustering criteria which are analyzed under the maximum likelihood approach for different multivariate Bernoulli mixtures. This alternative form of criterion reveals non-apparent aspects of clustering techniques. All the criteria discussed can be optimized with the alternating optimization algorithm. Some illustrative applications are included.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Arabie1992,
  author = {Arabie, P. and Hubert, L. J.},
  title = {Combinatorial data analysis},
  journal = {Annual Review of Psychology},
  year = {1992},
  volume = {43},
  pages = {169--203},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Celeux1992: "EM" is brace-protected against sentence-casing styles.
% NOTE(review): "2" in the exported title restored to the word "two"; confirm.
@ARTICLE{Celeux1992,
  author = {Celeux, G. and Govaert, G.},
  title = {A classification {EM} algorithm for clustering and two stochastic versions},
  journal = {Computational Statistics \& Data Analysis},
  year = {1992},
  volume = {14},
  pages = {315--332},
  number = {3},
  abstract = {Setting the optimization-based clustering methods under the classification maximum likelihood approach, we define and study a general Classification EM algorithm. Then, we derive from this algorithm two stochastic algorithms, incorporating random perturbations, to reduce the initial-position dependence of the classical optimization clustering algorithms.
Numerical experiments, reported for the variance criterion, show that both stochastic algorithms perform well compared with the standard k-means algorithm which is a particular version of the Classification EM algorithm.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Yang1993: page range uses an en dash; a comma that should have been a
% sentence-ending period in the abstract ("Basford, In this paper") is fixed.
@ARTICLE{Yang1993,
  author = {Yang, M. S.},
  title = {On a class of fuzzy classification maximum-likelihood procedures},
  journal = {Fuzzy Sets and Systems},
  year = {1993},
  volume = {57},
  pages = {365--375},
  number = {3},
  abstract = {Classification Maximum Likelihood (CML) procedure is a remarkable mixture of maximum likelihood approach to clustering. This has been well documented in the book of McLachlan and Basford. In this paper, we make the fuzzy extension of the CML procedure. Based on this class of fuzzy CML procedures, we extend the fuzzy clustering algorithms of Trauwaert, Kaufman and Rousseeuw by adding a penalty term. Especially, we derive a generalized type of fuzzy c-means (FCM) clustering algorithms, called the penalized FCM clustering algorithms. Then we create some asymptotic behaviors of these penalized FCM procedures. By doing some numerical examples we find that the penalized FCM is more meaningful and effective than FCM.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Qu1996,
  author = {Qu, Y. S. and Tan, M. and Kutner, M. H.},
  title = {Random effects models in latent class analysis for evaluating accuracy of diagnostic tests},
  journal = {Biometrics},
  year = {1996},
  volume = {52},
  pages = {797--810},
  number = {3},
  abstract = {When the results of a reference (or gold standard) test are missing or not error-free, the accuracy of diagnostic tests is often assessed through latent class models with two latent classes, representing diseased or nondiseased status. Such models, however, require that conditional on the true disease status, the tests are statistically independent, an assumption often violated in practice.
Consequently, the model generally fits the data poorly. In this paper, we develop a general latent class model with random effects to model the conditional dependence among multiple diagnostic tests (or readers). We also develop a graphical method for checking whether or not the conditional dependence is of concern and for identifying the pattern of the correlation. Using the random-effects model and the graphical method, a simple adequate model that is easy to interpret can be obtained. The methods are illustrated with three examples from the biometric literature. The proposed methodology is also applicable when the true disease status is indeed known and conditional dependence could well be present.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Celeux1996,
  author = {Celeux, G. and Soromenho, G.},
  title = {An entropy criterion for assessing the number of clusters in a mixture model},
  journal = {Journal of Classification},
  year = {1996},
  volume = {13},
  pages = {195--212},
  number = {2},
  abstract = {In this paper, we consider an entropy criterion to estimate the number of clusters arising from a mixture model. This criterion is derived from a relation linking the likelihood and the classification likelihood of a mixture. Its performance is investigated through Monte Carlo experiments, and it shows favorable results compared to other classical criteria.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% NOTE(review): "multinormal" in the second abstract sentence changed to
% "multinomial", matching the title's latent class setting and the rest of the
% abstract; confirm against the published abstract.
@ARTICLE{Yang2005,
  author = {Yang, M. S. and Yu, N. Y.},
  title = {Estimation of parameters in latent class models using fuzzy clustering algorithms},
  journal = {European Journal of Operational Research},
  year = {2005},
  volume = {160},
  pages = {515--531},
  number = {2},
  abstract = {A mixture approach to clustering is an important technique in cluster analysis. A mixture of multivariate multinomial distributions is usually used to analyze categorical data with latent class model.
The parameter estimation is an important step for a mixture distribution. Described here are four approaches to estimating the parameters of a mixture of multivariate multinomial distributions. The first approach is an extended maximum likelihood (ML) method. The second approach is based on the well-known expectation maximization (EM) algorithm. The third approach is the classification maximum likelihood (CML) algorithm. In this paper, we propose a new approach using the so-called fuzzy class model and then create the fuzzy classification maximum likelihood (FCML) approach for categorical data. The accuracy, robustness and effectiveness of these four types of algorithms for estimating the parameters of multivariate binomial mixtures are compared using real empirical data and samples drawn from the multivariate binomial mixtures of two classes. The results show that the proposed FCML algorithm presents better accuracy, robustness and effectiveness. Overall, the FCML algorithm has the superiority over the ML, EM and CML algorithms. Thus, we recommend FCML as another good tool for estimating the parameters of mixture multivariate multinomial models. (C) 2003 Elsevier B.V. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Bock1996a: page range uses an en dash; a comma that should have been a
% sentence-ending period in the abstract ("approaches, It presents") is fixed.
@ARTICLE{Bock1996a,
  author = {Bock, H. H.},
  title = {Probabilistic models in cluster analysis},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {5--28},
  number = {1},
  abstract = {This paper discusses cluster analysis in a probabilistic and inferential framework as opposed to more exploratory, heuristic or algorithmic approaches. It presents a broad survey on probabilistic models for partition-type, hierarchical and tree-like clustering structures and points to the relevant literature. It is shown how suitable clustering criteria or grouping methods may be derived from these models in the case of vector-valued data, dissimilarity matrices and similarity relations.
In particular, we discuss hypothesis testing for homogeneity or for a grouping structure, the asymptotic distribution of test statistics, the use of random graph theory and combinatorial methods for simulating random dendrograms. Our presentation of hierarchies includes, e.g., Markovian branching processes and phylogenetic inference based on molecular sequence data.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Huang2004: page range uses an en dash (--).
@ARTICLE{Huang2004,
  author = {Huang, G. H. and Bandeen-Roche, K.},
  title = {Building an identifiable latent class model with covariate effects on underlying and measured variables},
  journal = {Psychometrika},
  year = {2004},
  volume = {69},
  pages = {5--32},
  number = {1},
  abstract = {In recent years, latent class models have proven useful for analyzing relationships between measured multiple indicators and covariates of interest. Such models summarize shared features of the multiple indicators as an underlying categorical variable, and the indicators' substantive associations with predictors are built directly and indirectly in unique model parameters. In this paper, we provide a detailed study on the theory and application of building models that allow mediated relationships between primary predictors and latent class membership, but that also allow direct effects of secondary covariates on the indicators themselves. Theory for model identification is developed. We detail an Expectation-Maximization algorithm for parameter estimation, standard error calculation, and convergent properties. Comparison of the proposed model with models underlying existing latent class modeling software is provided. A detailed analysis of how visual impairments affect older persons' functioning requiring distance vision is used for illustration.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

@ARTICLE{Reboussin2006,
  author = {Reboussin, B. A. and Song, E. Y. and Shrestha, A. and Lohman, K. K.
and Wolfson, M.},
  title = {A latent class analysis of underage problem drinking: Evidence from a community sample of 16--20 year olds},
  journal = {Drug and Alcohol Dependence},
  year = {2006},
  volume = {83},
  pages = {199--209},
  number = {3},
  abstract = {The aim of this paper is to shed light on the nature of underage problem drinking by using an empirically based method to characterize the variation in patterns of drinking in a community sample of underage drinkers. A total of 4056 16-20-year-old current drinkers from 212 communities in the US were surveyed by telephone as part of the National Evaluation of the Enforcing Underage Drinking Laws (EUDL) Program. Latent class models were used to create homogenous groups of drinkers with similar drinking patterns defined by multiple indicators of drinking behaviors and alcohol-related problems. Two types of underage problem drinkers were identified; risky drinkers (30\%) and regular drinkers (27\%). The most prominent behaviors among both types of underage problem drinkers were binge drinking and getting drunk. Being male, other drug use, early onset drinking and beliefs about friends drinking and getting drunk were all associated with an increased risk of being a problem drinker after adjustment for other factors. Beliefs that most friends drink and current marijuana use were the strongest predictors of both risky problem drinking (OR = 4.0; 95\% CI = 3.1, 5.1 and OR = 4.0; 95\% CI = 2.8, 5.6, respectively) and regular problem drinking (OR = 10.8; 95\% CI 7.0, 16.7 and OR = 10.2; 95\% CI = 6.9, 15.2). Young adulthood (ages 18-20) was significantly associated with regular problem drinking but not risky problem drinking. The belief that most friends get drunk weekly was the strongest discriminator of risky and regular problem drinking patterns (OR = 5.3; 95\% CI = 3.9, 7.1).
These findings suggest that underage problem drinking is most strongly characterized by heavy drinking behaviors which can emerge in late adolescence and underscores its association with perceptions regarding friends drinking behaviors and illicit drug use. (c) 2005 Elsevier Ireland Ltd. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Reboussin2006b: page range uses an en dash; percent signs in the abstract
% are escaped and the bare "<" is set in math mode so the field is safe if a
% style typesets it.
@ARTICLE{Reboussin2006b,
  author = {Reboussin, B. A. and Anthony, J. C.},
  title = {Is there epidemiological evidence to support the idea that a cocaine dependence syndrome emerges soon after onset of cocaine use?},
  journal = {Neuropsychopharmacology},
  year = {2006},
  volume = {31},
  pages = {2055--2064},
  number = {9},
  abstract = {The present study uses latent class methods and multiple regression to shed light on hypothesized cocaine dependence syndromes experienced by community residents, who initiated cocaine use within 24 months of survey assessment, and explores possible variation in risk. Identified within public use data files from the United States National Household Surveys on Drug Abuse (NHSDA), and with assessments completed between 1995 and 1998, the study sample consists of 927 recent-onset cocaine users, defined as having initiated cocaine use no more than 24 months prior to assessment (approximate median elapsed time since onset of use similar to 12-13 months). The NHSDA included items to assess seven clinical features often associated with cocaine dependence, which were used in latent class modeling. Empirically derived latent classes, in conjunction with prior theory, tend to support a three-class solution, according to which 4\% of recent-onset users are members of a class that resembles the DSM-IV cocaine dependence syndrome (mean: 5.4 clinical features (CF)); 16\% might be in a cocaine dependence prodrome (mean: 2.4 CF); 80\% of recent-onset cocaine users had few or no clinical features (mean $<$ 1 CF).
Results from latent class regressions indicate that susceptibility to rapid transition from first cocaine use to onset of the LCA-assigned cocaine dependence syndrome might depend upon whether the user starts smoking crack-cocaine and, independently, age at first cocaine use.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.29}
}

% Reboussin2006a: page range uses an en dash (--).
@ARTICLE{Reboussin2006a,
  author = {Reboussin, B. A. and Lohman, K. K. and Wolfson, M.},
  title = {Modeling adolescent drug use patterns in cluster-unit trials with multiple sources of correlation using robust latent class regressions},
  journal = {Annals of Epidemiology},
  year = {2006},
  volume = {16},
  pages = {850--859},
  number = {11},
  abstract = {PURPOSE: The purpose of the study is to examine variation in adolescent drug-use patterns by using latent class regression analysis and evaluate the properties of an estimating-equations approach under different cluster-unit trial designs. METHODS: A set of second-order estimating equations for latent class models under the cluster-unit trial design are proposed. This approach models the correlation within subclusters (drug-use behaviors), but ignores the correlation within clusters (communities). A robust covariance estimator is proposed that accounts for within-cluster correlation. Performance of this approach is addressed through a Monte Carlo simulation study, and practical implications are illustrated by using data from the National Evaluation of the Enforcing Underage Drinking Laws Randomized Community Trial. RESULTS: The example shows that the proposed method provides useful information about the heterogeneous nature of drug use by identifying two subtypes of adolescent problem drinkers. A Monte Carlo simulation study supports the proposed estimation method by suggesting that the latent class model parameters were unbiased for 30 or more clusters.
Consistent with other studies of generalized estimating equation (GEE) estimators, the robust covariance estimator tended to underestimate the true variance of regression parameters, but the degree of inflation in the test size was relatively small for 70 clusters and only slightly inflated for 30 clusters. CONCLUSIONS: The proposed model for studying adolescent drug use provides an alternative to standard diagnostic criteria, focusing on the nature of the drug-use profile, rather than relying on univariate symptom counts. The second-order GEE-type estimation procedure provided a computationally feasible approach that performed well for a moderate number of clusters and was consistent with prior studies of GEE under the generalized linear model framework.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Harper1972, author = {Harper, D.}, title = {Local dependence latent structure models}, journal = {Psychometrika}, year = {1972}, volume = {37}, pages = {53+}, number = {1}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Ganesalingam1980, author = {Ganesalingam, S. and McLachlan, G. J.}, title = {A comparison of the mixture and classification approaches to cluster analysis}, journal = {Communications in Statistics Part A-Theory and Methods}, year = {1980}, volume = {9}, pages = {923--933}, number = {9}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Suppes1981, author = {Suppes, P. and Zanotti, M.}, title = {When are probabilistic explanations possible?}, journal = {Synthese}, year = {1981}, volume = {48}, pages = {191--199}, number = {2}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Windham1985, author = {Windham, M. P.}, title = {Numerical classification of proximity data with assignment measures}, journal = {Journal of Classification}, year = {1985}, volume = {2}, pages = {157--172}, number = {2-3}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Basford1985, author = {Basford, K. E.
and McLachlan, G. J.}, title = {Likelihood estimation with normal mixture models}, journal = {Applied Statistics-Journal of the Royal Statistical Society Series C}, year = {1985}, volume = {34}, pages = {282--289}, number = {3}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Quinn1987, author = {Quinn, B. G. and McLachlan, G. J. and Hjort, N. L.}, title = {A note on the {Aitkin-Rubin} approach to hypothesis-testing in mixture-models}, journal = {Journal of the Royal Statistical Society Series B-Methodological}, year = {1987}, volume = {49}, pages = {311--314}, number = {3}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Hagenaars1988, author = {Hagenaars, J. A.}, title = {Latent structure models with direct effects between indicators - local dependence models}, journal = {Sociological Methods \& Research}, year = {1988}, volume = {16}, pages = {379--405}, number = {3}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Bensmail1996, author = {Bensmail, H. and Celeux, G.}, title = {Regularized {Gaussian} discriminant analysis through eigenvalue decomposition}, journal = {Journal of the American Statistical Association}, year = {1996}, volume = {91}, pages = {1743--1748}, number = {436}, abstract = {Friedman proposed a regularization technique (RDA) of discriminant analysis in the Gaussian framework. RDA uses two regularization parameters to design an intermediate classifier between the linear, the quadratic, and the nearest-means classifiers. In this article we propose an alternative approach, called EDDA, that is based on the reparameterization of the covariance matrix [Sigma(k)] of a group G(k) in terms of its eigenvalue decomposition Sigma(k) = lambda(k)D(k)A(k)D(k)', where lambda(k) specifies the volume of density contours of G(k), the diagonal matrix of eigenvalues specifies its shape, and the eigenvectors specify its orientation.
Variations on constraints concerning volumes, shapes, and orientations lambda(k), A(k), and D-k lead to 14 discrimination models of interest. For each model, we derived the normal theory maximum likelihood parameter estimates. Our approach consists of selecting a model by minimizing the sample-based estimate of future misclassification risk by cross-validation. Numerical experiments on simulated and real data show favorable behavior of this approach compared to RDA.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Bryant1988, author = {Bryant, P.}, title = {On characterizing optimization-based clustering methods}, journal = {Journal of Classification}, year = {1988}, volume = {5}, pages = {81--84}, number = {1}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Bensmail1997, author = {Bensmail, H. and Celeux, G. and Raftery, A. E. and Robert, C. P.}, title = {Inference in model-based cluster analysis}, journal = {Statistics and Computing}, year = {1997}, volume = {7}, pages = {1--10}, number = {1}, abstract = {A new approach to cluster analysis has been introduced based on parsimonious geometric modelling of the within-group covariance matrices in a mixture of multivariate normal distributions, using hierarchical agglomeration and iterative relocation. It works well and is widely used via the MCLUST software available in S-PLUS and StatLib. However, it has several limitations: there is no assessment of the uncertainty about the classification, the partition can be suboptimal, parameter estimates are biased, the shape matrix has to be specified by the user, prior group probabilities are assumed to be equal, the method for choosing the number of groups is based on a crude approximation, and no formal way of choosing between the various possible models is included. Here, we propose a new approach which overcomes all these difficulties.
It consists of exact Bayesian inference via Gibbs sampling, and the calculation of Bayes factors (for choosing the model and the number of groups) from the output using the Laplace-Metropolis estimator. It works well in several real and simulated examples.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Lin1997, author = {Lin, T. H. and Dayton, C. M.}, title = {Model selection information criteria for non-nested latent class models}, journal = {Journal of Educational and Behavioral Statistics}, year = {1997}, volume = {22}, pages = {249--264}, number = {3}, abstract = {Latent class models have been developed for assessment of hierarchic relations in scaling and behavioral analysis. This article investigated the use of three model selection information criteria - Akaike AIC, Schwarz SIC, and Bozdogan CAIC - for non-nested models. In general, SIC and CAIC were superior to AIC for relatively simple models, whereas AIC was superior for more complex models, although accuracy was often quite low for such models. In addition, some effects were detected for error rates in the models.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Reboussin1999, author = {Reboussin, B. A. and Liang, K. Y. and Reboussin, D. M.}, title = {Estimating equations for a latent transition model with multiple discrete indicators}, journal = {Biometrics}, year = {1999}, volume = {55}, pages = {839--845}, number = {3}, abstract = {This paper proposes a two-part model for studying transitions between health states over time when multiple, discrete health indicators are available. The model includes a measurement model positing underlying latent health states and a transition model between latent health states over time. Full maximum likelihood estimation procedures are computationally complex in this latent variable framework, making only a limited class of models feasible and estimation of standard errors problematic.
For this reason, an estimating equations analogue of the pseudo-likelihood method for the parameters of interest, namely the transition model parameters, is considered. The finite sample properties of the proposed procedure are investigated through a simulation study and the importance of choosing strong indicators of the latent variable is demonstrated. The applicability of the methodology is illustrated with health survey data measuring disability in the elderly from the Longitudinal Study of Aging.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Roeder1999, author = {Roeder, K. and Lynch, K. G. and Nagin, D. S.}, title = {Modeling uncertainty in latent class membership: A case study in criminology}, journal = {Journal of the American Statistical Association}, year = {1999}, volume = {94}, pages = {766--776}, number = {447}, abstract = {Social scientists are commonly interested in relating a latent trait (e.g., criminal tendency) to measurable individual covariates (e.g., poor parenting) to understand what defines or perhaps causes the latent trait. In this article we develop an efficient and convenient method for answering such questions. The basic model presumes that two types of variables have been measured: response variables (possibly longitudinal) that partially determine the latent class membership, and covariates or risk factors that we wish to relate to these latent class variables. The model assumes that these observable variables are conditionally independent, given the latent class variable. We use a mixture model for the joint distribution of the observables. We apply this model to a longitudinal dataset assembled as part of the Cambridge Study of Delinquent Development to test a fundamental theory of criminal development. This theory holds that crime is committed by two distinct groups within the population: adolescent-limited offenders and life-course-persistent offenders.
As these labels suggest, the two groups are distinguished by the longevity of their offending careers. The theory also predicts that life-course-persistent offenders are disproportionately comprised of individuals born with neurological deficits and reared by caregivers without the skills and resources to effectively socialize a difficult child.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Peel2000, author = {Peel, D. and McLachlan, G. J.}, title = {Robust mixture modelling using the t distribution}, journal = {Statistics and Computing}, year = {2000}, volume = {10}, pages = {339--348}, number = {4}, abstract = {Normal mixture models are being increasingly used to model the distributions of a wide variety of random phenomena and to cluster sets of continuous multivariate data. However, for a set of data containing a group or groups of observations with longer than normal tails or atypical observations, the use of normal components may unduly affect the fit of the mixture model. In this paper, we consider a more robust approach by modelling the data by a mixture of t distributions. The use of the ECM algorithm to fit this t mixture model is described and examples of its use are given in the context of clustering multivariate data in the presence of atypical observations in the form of background noise.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Hennig2000, author = {Hennig, C.}, title = {Identifiability of models for clusterwise linear regression}, journal = {Journal of Classification}, year = {2000}, volume = {17}, pages = {273--296}, number = {2}, abstract = {Identifiability of the parameters is a necessary condition for the existence of consistent estimators. In this paper the identifiability of the parameters of models for data generated by different linear regression distributions with Gaussian errors is investigated. It turns out that such models cause other identifiability problems than do simple Gaussian mixtures.
This problem was heretofore ignored; thus there are no satisfying consistency proofs in this area. Three different models are treated: Finite mixture models with random and fixed covariates and a fixed partition model. Counterexamples and sufficient conditions for identifiability are given, including an example for nonidentifiable parameters with an invertible information matrix. The model choice and the interpretation of the parameters are discussed as well as the use of the identifiability concept for fixed partition models. The concept is generalized to ``partial identifiability''.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Reboussin2001, author = {Reboussin, B. A. and Anthony, J. C.}, title = {Latent class marginal regression models for modelling youthful drug involvement and its suspected influences}, journal = {Statistics in Medicine}, year = {2001}, volume = {20}, pages = {623--639}, number = {4}, abstract = {In longitudinal behavioural studies, it is common to have multiple categorical indicators for measuring a theoretical construct of interest. A latent class model is presented that accounts for the structure in a set of correlated, categorical variables measured at discrete time periods, drawing information from these variables to form a smaller number of latent classes. The dependence of the resulting latent class model parameters on suspected factors over time is simultaneously modelled using a baseline-category logistic regression model. Estimation of the model parameters is achieved using an estimating equations procedure. A motivating example is provided from a longitudinal study of suspected linkages between monitoring or supervision by parents and the occurrence of drug use behaviours in an epidemiologic sample of school-attending youths. Copyright (C) 2001 John Wiley \& Sons, Ltd.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Lin2002, author = {Lin, H. Q. and Turnbull, B. W. and McCulloch, C. E. and Slate, E.
H.}, title = {Latent class models for joint analysis of longitudinal biomarker and event process data: Application to longitudinal prostate-specific antigen readings and prostate cancer}, journal = {Journal of the American Statistical Association}, year = {2002}, volume = {97}, pages = {53--65}, number = {457}, abstract = {A retrospective substudy of the nutritional prevention of cancer (NPC) trials investigated the utility of longitudinally measured prostate-specific antigen (PSA) as a biomarker for subsequent onset of prostate cancer (PCa). Serial PSA levels were determined retrospectively from frozen blood samples that had been collected from all patients at successive clinic visits with the timing and the number of these visits highly variable. Diagnosis dates of all incident cases of PCa were recorded. Heterogeneity in PSA trajectories was observed that could not be fully explained by the usual linear mixed-effects model and measured covariates. Latent class models that incorporate both a longitudinal biomarker process and an event process offer a way to handle additional heterogeneity, to uncover distinct subpopulations, to incorporate correlated nonnormally distributed outcomes, and to classify individuals into risk classes. Our latent class joint model can aid the prediction of PCa probability given the longitudinal biomarker information available on an individual up to any date. The proposed model easily accommodates highly unbalanced longitudinal data and recurrent events. There are two levels of structure in the latent class joint model. First, the uncertainty of latent class membership is specified through a multinomial logistic model. Second, the class-specific marker trajectory and event process are specified parametrically and semiparametrically, under the assumption of conditional independence given the latent class membership. We use a likelihood approach to obtain parameter estimates via the EM algorithm.
We fit the latent class joint model to the data from the NPC trials; four distinct subpopulations are identified that differ with regard to their PSA trajectories and risk for prostate cancer. Higher PSA level is significantly associated with increased risk of PCa, but appears to be conditionally independent once the latent classes are taken into account. Among the covariates, selenium supplementation and age at entry are statistically significant for various parts of the model. Assumptions---in particular the conditional independence between the longitudinal PSA biomarker and time to PCa diagnosis---are assessed.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Ng2003, author = {Ng, S. K. and McLachlan, G. J.}, title = {An {EM}-based semi-parametric mixture model approach to the regression analysis of competing-risks data}, journal = {Statistics in Medicine}, year = {2003}, volume = {22}, pages = {1097--1111}, number = {7}, abstract = {We consider a mixture model approach to the regression analysis of competing-risks data. Attention is focused on inference concerning the effects of factors on both the probability of occurrence and the hazard rate conditional on each of the failure types. These two quantities are specified in the mixture model using the logistic model and the proportional hazards model, respectively. We propose a semi-parametric mixture method to estimate the logistic and regression coefficients jointly, whereby the component-baseline hazard functions are completely unspecified. Estimation is based on maximum likelihood on the basis of the full likelihood, implemented via an expectation-conditional maximization (ECM) algorithm. Simulation studies are performed to compare the performance of the proposed semi-parametric method with a fully parametric mixture approach.
The results show that when the component-baseline hazard is monotonic increasing, the semi-parametric and fully parametric mixture approaches are comparable for mildly and moderately censored samples. When the component-baseline hazard is not monotonic increasing, the semi-parametric method consistently provides less biased estimates than a fully parametric approach and is comparable in efficiency in the estimation of the parameters for all levels of censoring. The methods are illustrated using a real data set of prostate cancer patients treated with different dosages of the drug diethylstilbestrol. Copyright (C) 2003 John Wiley \& Sons, Ltd.}, owner = {rebeccaspeckman}, timestamp = {2009.01.29} } @ARTICLE{Newton1994, author = {Newton, M. A. and Raftery, A. E. and Davison, A. C. and Bacha, M. and Celeux, G. and Carlin, B. P. and Clifford, P. and Lu, C. and Sherman, M. and Tanner, M. A. and Gelfand, A. E. and Mallick, B. K. and Gelman, A. and Grieve, A. P. and Kunsch, H. R. and Leonard, T. and Hsu, J. S. J. and Liu, J. S. and Rubin, D. B. and Lo, A. Y. and Louis, T. A. and Neal, R. M. and Owen, A. B. and Tu, D. S. and Gilks, W. R. and Roberts, G. and Sweeting, T. and Bates, D. and Ritter, G. and Worton, B. J. and Barnard, G. A. and Gibbens, R. and Silverman, B.}, title = {Approximate {Bayesian} inference with the weighted likelihood bootstrap}, journal = {Journal of the Royal Statistical Society Series B-Methodological}, year = {1994}, volume = {56}, pages = {3--48}, number = {1}, abstract = {We introduce the weighted likelihood bootstrap (WLB) as a way to simulate approximately from a posterior distribution. This method is often easy to implement, requiring only an algorithm for calculating the maximum likelihood estimator, such as iteratively reweighted least squares. In the generic weighting scheme, the WLB is first order correct under quite general conditions.
Inaccuracies can be removed by using the WLB as a source of samples in the sampling-importance resampling (SIR) algorithm, which also allows incorporation of particular prior information. The SIR-adjusted WLB can be a competitive alternative to other integration methods in certain models. Asymptotic expansions elucidate the second-order properties of the WLB, which is a generalization of Rubin's Bayesian bootstrap. The calculation of approximate Bayes factors for model comparison is also considered. We note that, given a sample simulated from the posterior distribution, the required marginal likelihood may be simulation consistently estimated by the harmonic mean of the associated likelihood values; a modification of this estimator that avoids instability is also noted. These methods provide simple ways of calculating approximate Bayes factors and posterior model probabilities for a very wide class of models.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Mkhadri1997, author = {Mkhadri, A. and Celeux, G. and Nasroallah, A.}, title = {Regularization in discriminant analysis: An overview}, journal = {Computational Statistics \& Data Analysis}, year = {1997}, volume = {23}, pages = {403--423}, number = {3}, abstract = {This paper presents an overview of regularized techniques in discriminant analysis. The case of continuous variables is treated first, and then the case of discrete variables. Three types of approaches are distinguished: combining standard methods, constraining models and Bayesian modelling.
We include numerical experiments to assess the efficiency of regularized versions of predictive discrimination and to illustrate the superiority of regularization on variable subset selection in a small sample setting.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Li2006, author = {Li, T.}, title = {A unified view on clustering binary data}, journal = {Machine Learning}, year = {2006}, volume = {62}, pages = {199--215}, number = {3}, abstract = {Clustering is the problem of identifying the distribution of patterns and intrinsic correlations in large data sets by partitioning the data points into similarity classes. This paper studies the problem of clustering binary data. Binary data have been occupying a special place in the domain of data analysis. A unified view of binary data clustering is presented by examining the connections among various clustering criteria. Experimental studies are conducted to empirically verify the relationships.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Eaton1989a, author = {Eaton, W. W. and Dryman, A. and Sorenson, A. and McCutcheon, A.}, title = {{DSM-III} major depressive disorder in the community - a latent class analysis of data from the {NIMH} {Epidemiologic Catchment-Area Program}}, journal = {British Journal of Psychiatry}, year = {1989}, volume = {155}, pages = {48--54}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Clifford1993, author = {Clifford, P. and Jennison, C. and Wakefield, J. and Phillips, D. and Frigessi, A. and Gray, A. J. and Lawson, A. and Forster, J. and Ramgopal, P. and Arslan, O. and Constable, P. D. L. and Kent, J. T. and Wolff, R. C. L. and Harding, E. F. and Middleton, R. and Diggle, P. J. and Aykroyd, R. G. and Berzuini, C. and Brewer, M. and Aitken, C. and Celeux, G. and Diebolt, J. and Critchley, F. and Diaconis, P. and Rosenthal, J. S. and Diebolt, J. and Robert, C. P. and Gelfand, A. E. and Lee, T. M. and Gelman, A. and Rubin, D. B. and Geman, D.
and Geweke, J. and Geyer, C. J. and Gigli, A. and Givens, G. H. and Goodall, C. and Jonalasinio, G. D. and Grieve, A. P. and Han, X. L. and Kolassa, J. E. and Tanner, M. A. and Kooperberg, C. and Lewis, S. M. and Lin, S. and Thompson, E. A. and Litton, C. D. and Buck, C. E. and Liu, C. H. and Liu, J. and Mardia, K. V. and Marriott, J. M. and Moller, J. and Raftery, A. E. and Shephard, N. and Sinha, D. and Sokal, A. D. and Titterington, D. M. and Wilson, J. D. and York, J. and Madigan, D. and Smith, A. F. M. and Roberts, G. O. and Besag, J. and Green, P. J. and Gilks, W. R. and Clayton, D. G. and Spiegelhalter, D. J.}, title = {Discussion on the meeting on the {Gibbs} sampler and other {Markov} chain {Monte Carlo} methods}, journal = {Journal of the Royal Statistical Society Series B-Methodological}, year = {1993}, volume = {55}, pages = {53--102}, number = {1}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Govaert2003, author = {Govaert, G. and Nadif, M.}, title = {Clustering with block mixture models}, journal = {Pattern Recognition}, year = {2003}, volume = {36}, pages = {463--473}, number = {2}, abstract = {Basing cluster analysis on mixture models has become a classical and powerful approach. Until now, this approach, which allows to explain some classic clustering criteria such as the well-known k-means criteria and to propose general criteria, has been developed to classify a set of objects measured on a set of variables. But, for this kind of data, if most clustering procedures are designated to construct an optimal partition of objects or, sometimes, of variables, there exist others methods, named block clustering methods, which consider simultaneously the two sets and organize the data into homogeneous blocks. In this work, a new mixture model called block mixture model is proposed to take into account this situation. This model allows to embed simultaneous clustering of objects and variables in a mixture approach.
We first consider this probabilistic model in a general context and we develop a new algorithm of simultaneous partitioning based on the CEM algorithm. Then, we focus on the case of binary data and we show that our approach allows us to extend a block clustering method, which had been proposed in this case. Simplicity, fast convergence and the possibility to process large data sets are the major advantages of the proposed approach. (C) 2002 Pattern Recognition Society. Published by Elsevier Science Ltd. All rights reserved.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Govaert1996, author = {Govaert, G. and Nadif, M.}, title = {Comparison of the mixture and the classification maximum likelihood in cluster analysis with binary data}, journal = {Computational Statistics \& Data Analysis}, year = {1996}, volume = {23}, pages = {65--81}, number = {1}, abstract = {In this paper we propose to extend the comparison between the maximum likelihood and the classification maximum likelihood approaches for the Gaussian mixture (Ganesalingam, 1989; Celeux and Govaert, 1993) in the case of binary data. To this end, we use Bernoulli distribution mixtures. As with continuous data, two situations are discussed: first where mixing proportions are taken to be equal and secondly where they are unknown. The comparison realized with Monte-Carlo numerical experiments confirms the results obtained with continuous data. The choice of the approach depends on the size of the sample but assumptions about the mixing proportions are more important than the choice between the two approaches.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @INPROCEEDINGS{Celeux1991a, author = {Celeux, G. and Hebrail, G. and Mkhadri, A. and Suchard, M.}, title = {Reduction of a large-scale and ill-conditioned statistical problem on textual data}, booktitle = {5th International Symposium on Applied Stochastic Models and Data Analysis}, year = {1991}, editor = {Gutierrez, R.
and Valderrama, M. J.}, pages = {129--137}, address = {Granada, Spain}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Celeux2001, author = {Celeux, G. and Chretien, S. and Forbes, F. and Mkhadri, A.}, title = {A component-wise {EM} algorithm for mixtures}, journal = {Journal of Computational and Graphical Statistics}, year = {2001}, volume = {10}, pages = {697--712}, number = {4}, abstract = {Maximum likelihood estimation in finite mixture distributions is typically approached as an incomplete data problem to allow application of the expectation-maximization (EM) algorithm. In its general formulation, the EM algorithm involves the notion of a complete data space, in which the observed measurements and incomplete data are embedded. An advantage is that many difficult estimation problems are facilitated when viewed in this way. One drawback is that the simultaneous update used by standard EM requires overly informative complete data spaces, which leads to slow convergence in some situations. In the incomplete data context, it has been shown that the use of less informative complete data spaces, or equivalently smaller missing data spaces, can lead to faster convergence without sacrificing simplicity. However, in the mixture case, little progress has been made in speeding up EM. In this article we propose a component-wise EM for mixtures. It uses, at each iteration, the smallest admissible missing data space by intrinsically decoupling the parameter updates. Monotonicity is maintained, although the estimated proportions may not sum to one during the course of the iteration. However, we prove that the mixing proportions will satisfy this constraint upon convergence. Our proof of convergence relies on the interpretation of our procedure as a proximal point algorithm. For performance comparison, we consider standard EM as well as two other algorithms based on missing data space reduction, namely the SAGE and AECME algorithms.
We provide adaptations of these general procedures to the mixture case. We also consider the ECME algorithm, which is not a data augmentation scheme but still aims at accelerating EM. Our numerical experiments illustrate the advantages of the component-wise EM algorithm relative to these other methods.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Celeux1996b, author = {Celeux, G. and Chauveau, D. and Diebolt, J.}, title = {Stochastic versions of the {EM} algorithm: An experimental study in the mixture case}, journal = {Journal of Statistical Computation and Simulation}, year = {1996}, volume = {55}, pages = {287--314}, number = {4}, abstract = {We compare three different stochastic versions of the EM algorithm: The Stochastic EM algorithm (SEM), the ``Simulated Annealing'' EM algorithm (SAEM) and the Monte Carlo EM algorithm (MCEM). We focus particularly on the mixture of distributions problem. In this context, we investigate the practical behaviour of these algorithms through intensive Monte Carlo numerical simulations and a real data study. We show that, for some particular mixture situations, the SEM algorithm is almost always preferable to the EM and ``simulated annealing'' versions SAEM and MCEM. For some severely overlapping mixtures, however, none of these algorithms can be confidently used. Then, SEM can be used as an efficient data exploratory tool for locating significant maxima of the likelihood function. In the real data case, we show that the SEM stationary distribution provides a contrasted view of the loglikelihood by emphasizing sensible maxima.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Biernacki2006, author = {Biernacki, C. and Celeux, G. and Govaert, G.
and Langrognet, F.}, title = {Model-based cluster and discriminant analysis with the {MIXMOD} software}, journal = {Computational Statistics \& Data Analysis}, year = {2006}, volume = {51}, pages = {587--600}, number = {2}, abstract = {The Mixture Modeling (MIXMOD) program fits mixture models to a given data set for the purposes of density estimation, clustering or discriminant analysis. A large variety of algorithms to estimate the mixture parameters are proposed (EM, Classification EM, Stochastic EM), and it is possible to combine these to yield different strategies for obtaining a sensible maximum for the likelihood (or complete-data likelihood) function. MIXMOD is currently intended to be used for multivariate Gaussian mixtures, and fourteen different Gaussian models can be distinguished according to different assumptions regarding the component variance matrix eigenvalue decomposition. Moreover, different information criteria for choosing a parsimonious model (the number of mixture components, for instance) are included, their suitability depending on the particular perspective (cluster analysis or discriminant analysis). Written in C++, MIXMOD is interfaced with SCILAB and MATLAB. The program, the statistical documentation and the user guide are available on the internet at the following address: http://www-math.univ-fcomte.fr/mixmod/index.php. (c) 2006 Elsevier B.V. All rights reserved.}, owner = {rebeccaspeckman}, timestamp = {2009.01.30} } @ARTICLE{Biernacki2003, author = {Biernacki, C. and Celeux, G. and Govaert, G.}, title = {Choosing starting values for the {EM} algorithm for getting the highest likelihood in multivariate {Gaussian} mixture models}, journal = {Computational Statistics \& Data Analysis}, year = {2003}, volume = {41}, pages = {561--575}, number = {3-4}, abstract = {Simple methods to choose sensible starting values for the EM algorithm to get maximum likelihood parameter estimation in mixture models are compared.
They are based on random initialization, using a classification EM algorithm (CEM), a Stochastic EM algorithm (SEM) or previous short runs of EM itself. Those initializations are included in a search/run/select strategy which can be compounded by repeating the three steps. They are compared in the context of multivariate Gaussian mixtures on the basis of numerical experiments on both simulated and real data sets in a target number of iterations. The main conclusions of those numerical experiments are the following. The simple random initialization which is probably the most employed way of initiating EM is often outperformed by strategies using CEM, SEM or shorts runs of EM before running EM. Also, it appears that compounding is generally profitable since using a single run of EM can often lead to suboptimal solutions. Otherwise, none of the experimental strategies can be regarded as the best one and it is difficult to characterize situations where a particular strategy can be expected to outperform the other ones. However, the strategy initiating EM with short runs of EM can be recommended. This strategy, which as far as we know was not used before the present study, has some advantages. It is simple, performs well in a lot of situations presupposing no particular form of the mixture to be fitted to the data and seems little sensitive to noisy data. (C) 2002 Elsevier Science B.V. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.30}
}

% Biernacki2000: journal name spelled as in Selim1984 ({IEEE} protected).
@ARTICLE{Biernacki2000,
  author = {Biernacki, C. and Celeux, G. and Govaert, G.},
  title = {Assessing a mixture model for clustering with the integrated completed likelihood},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year = {2000},
  volume = {22},
  pages = {719--725},
  number = {7},
  abstract = {We propose assessing a mixture model in a cluster analysis setting with the integrated completed likelihood.
With this purpose, the observed data are assigned to unknown clusters using a maximum a posteriori operator. Then, the Integrated Completed Likelihood (ICL) is approximated using an {\`a} la Bayesian information criterion (BIC). Numerical experiments on simulated and real data of the resulting ICL criterion show that it performs well both for choosing a mixture model and a relevant number of clusters. In particular, ICL appears to be more robust than BIC to violation of some of the mixture model assumptions and it can select a number of clusters leading to a sensible partitioning of the data.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.30}
}

% Sneath1962: the database export left an open-ended page range (855-&); the closing page is filled in.
@ARTICLE{Sneath1962,
  author = {Sneath, P. H. A. and Sokal, R. R.},
  title = {Numerical taxonomy},
  journal = {Nature},
  year = {1962},
  volume = {193},
  pages = {855--860},
  number = {4818},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.30}
}

% Robert1997: discussant names O'Hagan and Boehning restored from their ASCII-flattened forms.
@ARTICLE{Robert1997,
  author = {Robert, C. P. and Aitkin, M. and Cox, D. R. and Stephens, M. and Polymenis, A. and Gilks, W. R. and Nobile, A. and Hodgson, M. and O'Hagan, A. and Longford, N. T. and Dawid, A. P. and Atkinson, A. C. and Bernardo, J. M. and Besag, J. and Brooks, S. P. and Byers, S. and Raftery, A. and Celeux, G. and Cheng, R. C. H. and Liu, W. B. and Chien, Y. H. and George, E. I. and Cressie, N. and Huang, H. C. and Gruet, M. A. and Heath, S. C. and Jennison, C. and Lawson, A. B. and Clark, A. and McLachlan, G. and Peel, D. and Mengersen, K. and George, A. and Philippe, A. and Roeder, K. and Wasserman, L. and Schlattmann, P. and B{\"o}hning, D. and Titterington, D. M. and Tong, H. and West, M.},
  title = {On {Bayesian} analysis of mixtures with an unknown number of components - Discussion},
  journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  year = {1997},
  volume = {59},
  pages = {758--792},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.30}
}

@ARTICLE{Redner1984,
  author = {Redner, R. A. and Walker, H.
F.},
  title = {Mixture densities, maximum likelihood and the {EM} algorithm},
  journal = {SIAM Review},
  year = {1984},
  volume = {26},
  pages = {195--237},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.30}
}

% Qian2006: ISSN and DOI moved out of the free-text note into their own fields.
@ARTICLE{Qian2006,
  author = {Qian, Zhiguang and Shapiro, Alexander},
  title = {Simulation-based approach to estimation of latent variable models},
  journal = {Computational Statistics \& Data Analysis},
  year = {2006},
  volume = {51},
  pages = {1243--1259},
  number = {2},
  issn = {0167-9473},
  doi = {10.1016/j.csda.2006.02.016},
  abstract = {We propose a simulation-based method for calculating maximum likelihood estimators in latent variable models. The proposed method integrates a recently developed sampling strategy, the so-called Sample Average Approximation method, to efficiently compute high quality solutions of the estimation problem. Theoretical and algorithmic properties of the method are discussed. A computational study, involving two numerical examples, is presented to highlight a significant improvement of the proposed approach over existing methods.},
  keywords = {Simulation-based inference; Latent variable models; Stochastic programming; Sample average approximation},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% NOTE: Biernacki2006a describes the same article as Biernacki2006; both keys are
% kept so that existing citations still resolve.
@ARTICLE{Biernacki2006a,
  author = {Biernacki, Christophe and Celeux, Gilles and Govaert, G{\'e}rard and Langrognet, Florent},
  title = {Model-based cluster and discriminant analysis with the {MIXMOD} software},
  journal = {Computational Statistics \& Data Analysis},
  year = {2006},
  volume = {51},
  pages = {587--600},
  number = {2},
  issn = {0167-9473},
  doi = {10.1016/j.csda.2005.12.015},
  abstract = {The Mixture Modeling (MIXMOD) program fits mixture models to a given data set for the purposes of density estimation, clustering or discriminant analysis.
A large variety of algorithms to estimate the mixture parameters are proposed (EM, Classification EM, Stochastic EM), and it is possible to combine these to yield different strategies for obtaining a sensible maximum for the likelihood (or complete-data likelihood) function. MIXMOD is currently intended to be used for multivariate Gaussian mixtures, and fourteen different Gaussian models can be distinguished according to different assumptions regarding the component variance matrix eigenvalue decomposition. Moreover, different information criteria for choosing a parsimonious model (the number of mixture components, for instance) are included, their suitability depending on the particular perspective (cluster analysis or discriminant analysis). Written in C++, MIXMOD is interfaced with SCILAB and MATLAB. The program, the statistical documentation and the user guide are available on the internet at the following address: http://www-math.univ-fcomte.fr/mixmod/index.php.},
  keywords = {Gaussian models; EM-like algorithms; Model selection},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% Leisch2006: DOI moved from the note into the doi field.
@ARTICLE{Leisch2006,
  author = {Leisch, Friedrich},
  title = {A toolbox for {$K$}-centroids cluster analysis},
  journal = {Computational Statistics \& Data Analysis},
  year = {2006},
  volume = {51},
  pages = {526--544},
  number = {2},
  doi = {10.1016/j.csda.2005.10.006},
  abstract = {A methodological and computational framework for centroid-based partitioning cluster analysis using arbitrary distance or similarity measures is presented. The power of high-level statistical computing environments like R enables data analysts to easily try out various distance measures with only minimal programming effort. A new variant of centroid neighborhood graphs is introduced which gives insight into the relationships between adjacent clusters.
Artificial examples and a case study from marketing research are used to demonstrate the influence of distances measures on partitions and usage of neighborhood graphs.},
  keywords = {Cluster analysis; Distance measures; R},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% Lauro1996, Rousseeuw1996: ISSN and DOI moved out of the free-text note into their own fields.
@ARTICLE{Lauro1996,
  author = {Lauro, Carlo},
  title = {Computational statistics or statistical computing, is that the question?},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {191--193},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/0167-9473(96)88920-1},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@ARTICLE{Rousseeuw1996,
  author = {Rousseeuw, P. J. and Kaufman, L. and Trauwaert, E.},
  title = {Fuzzy clustering using scatter matrices},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {135--151},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)00026-6},
  abstract = {Starting from the well-known fuzzy k-means method, which was mainly intended for spherical clusters, several methods are considered which incorporate cluster-specific scatter matrices. This enables them to describe elliptical clusters with different orientation. The distinction between these methods lies in the way they deal with clusters of different volume, cardinality, and density. Some industrial examples show that different applications may lead to different goals and preferences, which affect the choice of the clustering method.},
  keywords = {Ellipsoidal clusters; Fuzzy clustering; Industrial applications; Maximum likelihood; SAND method},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@ARTICLE{Hartigan1996,
  author = {Hartigan, J.
A.},
  title = {Recognition},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {97--103},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)00023-0},
  abstract = {In making a prediction, we divide our attention between objects presently perceived and previously experienced objects. The present objects are recognised as similar to objects previously experienced, and the qualities remembered from previous examination are predicted for the present objects. Prediction is fallible, in that we may make errors in recognising the present object, or in past observations of the experienced object, or in assigning qualities to the present object which hold for the experienced objects similar to the present object, but not for the present object. We use probability to quantify these errors. We use classification in organizing our experiences, and in recognising present objects as being similar to some species of experienced objects.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% VanCutsem1996: ISSN and DOI moved out of the free-text note into their own fields.
@ARTICLE{VanCutsem1996,
  author = {Van Cutsem, Bernard},
  title = {Combinatorial structures and structures for classification},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {169--188},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)00028-X},
  abstract = {The problem of random generation of structures of classification, such as partitions or various kinds of hierarchies, is approached in two different ways. On the one hand, structures of classification are considered as particular combinatorial structures and then they can be randomly generated using the calculus introduced by Flajolet, Zimmermann and Van Cutsem (1993). On the other hand, some Markov or renewal process models can be introduced to describe some of these structures and can be used to generate such random structures, according to some ideas of Van Cutsem and Ycart (1993).
Some asymptotics properties allow one to simplify the procedures. These last results are presented in the example of stratified hierarchies. It is clear that these techniques are mainly useful for the problem of the classifiability of data.},
  keywords = {Combinatorial structures operations and specifications, uniform random generation Structures of classification, partitions, hierarchies Markov chains},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% Hardy1996: author's given name repaired (mis-encoded e-acute); ISSN and DOI moved out of the note.
@ARTICLE{Hardy1996,
  author = {Hardy, Andr{\'e}},
  title = {On the number of clusters},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {83--96},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)00022-9},
  abstract = {A large number of classification and clustering methods for defining and calculating optimal or well-suited partitions for data sets are available. Perhaps the most difficult problem facing the user of cluster analysis techniques in practice is the objective assessment of the stability and validity of the clusters found by the numerical technique used. The problem of determining the ``true'' number of clusters has been called the fundamental problem of cluster validity. The aim of this paper is to compare three methods based on the hypervolume criterion with other well-known methods. To illustrate and compare their behaviour, these procedures for determining the number of clusters are applied to artificially constructed bivariate data containing various types of structure. To provide a variety of solutions six clustering methods are used. We finally conclude by pointing out the performance of each method and by giving some recommendations to help potential users of these techniques.},
  keywords = {Cluster analysis; Hypervolume criterion; Number of clusters},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@ARTICLE{Hartigan1996a,
  author = {Hartigan, J.
A.},
  title = {Introduction: Classification, probability and statistics},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {3--4},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)00017-5},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@ARTICLE{Rasson1996,
  author = {Rasson, J. P. and Rousseeuw, P. J.},
  title = {Preface},
  journal = {Computational Statistics \& Data Analysis},
  year = {1996},
  volume = {23},
  pages = {1},
  number = {1},
  issn = {0167-9473},
  doi = {10.1016/S0167-9473(96)90279-0},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@BOOK{Hartigan1975,
  title = {Clustering algorithms},
  publisher = {Wiley},
  year = {1975},
  author = {Hartigan, John A.},
  address = {New York},
  keywords = {Cluster analysis. Cluster analysis Data processing.},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

% McCutcheon2002, Vermunt2002: chapters in an edited volume are stored as INCOLLECTION
% (chapter title in title, book title in booktitle) so that the chapter authors and
% the book editors are both rendered by standard styles.
@INCOLLECTION{McCutcheon2002,
  author = {McCutcheon, Allan L.},
  title = {Basic concepts and procedures in single- and multiple-group latent class analysis},
  booktitle = {Applied latent class analysis},
  publisher = {Cambridge University Press},
  year = {2002},
  editor = {Hagenaars, Jacques A. and McCutcheon, Allan L.},
  pages = {56--85},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@INCOLLECTION{Vermunt2002,
  author = {Vermunt, Jeroen K. and Magidson, Jay},
  title = {Latent class cluster analysis},
  booktitle = {Applied latent class analysis},
  publisher = {Cambridge University Press},
  year = {2002},
  editor = {Hagenaars, Jacques A. and McCutcheon, Allan L.},
  pages = {89--106},
  owner = {rebeccaspeckman},
  timestamp = {2009.01.31}
}

@ARTICLE{Goodman1974,
  author = {Goodman, Leo A.},
  title = {The Analysis of Systems of Qualitative Variables When Some of the Variables Are Unobservable.
Part {I}---{A} Modified Latent Structure Approach},
  journal = {The American Journal of Sociology},
  year = {1974},
  month = mar,
  volume = {79},
  pages = {1179--1259},
  number = {5},
  issn = {0002-9602},
  copyright = {Copyright {\copyright} 1974 The University of Chicago Press},
  abstract = {This article presents methods for analyzing the relationships among a set of qualitative variables when some of these variables are specified manifest (i.e., observed) variables and others are latent (i.e., unobserved or unobservable) variables. We shall show how to estimate the magnitude of the various effects represented in path-diagram models that include both the manifest and latent variables, and also how to test whether this kind of path-diagram model is congruent with the observed data. These methods can be applied in order to analyze data obtained in various kinds of surveys (including panel studies), and also in order to construct tests and indices for purposes of measurement and prediction. To illustrate their wide applicability and flexibility, we shall use these methods to reanalyze several different sets of data which were analyzed earlier by Coleman (1964), Lazarsfeld (1948, 1970), Goodman (1973a), and others.
Except for some related conclusions in Goodman (1973a), the methods introduced herein lead to conclusions that are very different from those presented by the other researchers who had analyzed these data earlier.},
  owner = {rebeccaspeckman},
  timestamp = {2009.02.01}
}

% Clogg1981: the export metadata that was in the note is split into issn, month and copyright fields.
@ARTICLE{Clogg1981,
  author = {Clogg, Clifford C.},
  title = {Latent Structure Models of Mobility},
  journal = {The American Journal of Sociology},
  year = {1981},
  month = jan,
  volume = {86},
  pages = {836--868},
  number = {4},
  issn = {0002-9602},
  copyright = {Copyright {\copyright} 1981 The University of Chicago Press},
  abstract = {This paper proposes several latent structure models for the analysis of mobility tables and examines the relationship of these to some earlier mobility models (e.g., the ``perfect'' and the ``quasi-perfect'' mobility models). Data from the classic Danish (5 X 5) and British (5 X 5 and 8 X 8) mobility tables are used to illustrate the utility of these methods in comparative analysis. A model designated as a quasi-latent structure is suggested as a plausible rendering of the structure of mobility for each set of data, and this model is used to derive various kinds of substantive inferences.},
  owner = {rebeccaspeckman},
  timestamp = {2009.02.03}
}

% Baulieu1989: all-caps export title converted to sentence case; the slash in
% presence/absence follows the wording used in the Baulieu1997 title.
@ARTICLE{Baulieu1989,
  author = {Baulieu, F. B.},
  title = {A classification of presence/absence based dissimilarity coefficients},
  journal = {Journal of Classification},
  year = {1989},
  volume = {6},
  pages = {233--246},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.02.27}
}

@ARTICLE{Baulieu1997,
  author = {Baulieu, F. B.},
  title = {Two variant axiom systems for presence/absence based dissimilarity coefficients},
  journal = {Journal of Classification},
  year = {1997},
  volume = {14},
  pages = {159--170},
  number = {1},
  abstract = {The axiomatic treatment of presence/absence based dissimilarity coefficients presented by Baulieu (1989) is extended. Independence of the axiom sets is established.
Global order equivalence is examined as well as its relation to the axiom sets for two variations of the model. It is shown that if paired absences are to be ignored in the calculation of dissimilarity values, then one variant model admits only a single dissimilarity coefficient (up to global order equivalence): (a + b)/(a + b + c).},
  owner = {rebeccaspeckman},
  timestamp = {2009.02.27}
}

% MacQueen1967: author initials, volume and publisher added; entry type written as
% INPROCEEDINGS (CONFERENCE is only an alias for it).
@INPROCEEDINGS{MacQueen1967,
  author = {MacQueen, J. B.},
  title = {Some methods for classification and analysis of multivariate observations},
  booktitle = {Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability},
  year = {1967},
  volume = {1},
  pages = {281--297},
  publisher = {University of California Press},
  address = {Berkeley, CA},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@INPROCEEDINGS{Guolo2006,
  author = {Guolo, A. and Brazzale, A. R.},
  title = {Empirical evaluation of measurement error correction techniques for matched case-control studies},
  booktitle = {ISEE/ISEA 2006 Conference},
  year = {2006},
  pages = {S437},
  address = {Paris, France},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Brusco2004,
  author = {Brusco, M. J.},
  title = {Clustering binary data in the presence of masking variables},
  journal = {Psychological Methods},
  year = {2004},
  volume = {9},
  pages = {510--523},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% Hubert2006: library-catalogue punctuation in publisher/address replaced by plain lists.
@BOOK{Hubert2006,
  title = {The structural representation of proximity matrices with {MATLAB}},
  publisher = {Society for Industrial and Applied Mathematics and American Statistical Association},
  year = {2006},
  author = {Hubert, Lawrence J. and Arabie, Phipps and Meulman, Jacqueline},
  series = {ASA-SIAM series on statistics and applied probability},
  address = {Philadelphia, PA and Alexandria, VA},
  keywords = {Proximity matrices. Functions. Representations of graphs.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@BOOK{Arabie1996a,
  title = {Clustering and classification},
  publisher = {World Scientific},
  year = {1996},
  author = {Arabie, Phipps and Hubert, Lawrence J.
and De Soete, Geert},
  address = {Singapore},
  keywords = {Cluster analysis. Discriminant analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% Arabie1996: a chapter in an edited volume, stored as INCOLLECTION (chapter title in
% title, book title in booktitle). The editor surname is written De Soete, Geert so
% that the particle is parsed as part of the surname.
@INCOLLECTION{Arabie1996,
  author = {Arabie, Phipps and Hubert, Lawrence J.},
  title = {An overview of combinatorial data analysis},
  booktitle = {Clustering and classification},
  publisher = {World Scientific},
  year = {1996},
  editor = {Arabie, Phipps and Hubert, Lawrence J. and De Soete, Geert},
  pages = {5--63},
  address = {Singapore},
  keywords = {Cluster analysis. Discriminant analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Sheng2006,
  author = {Sheng, W. G. and Liu, X. H.},
  title = {A genetic {k-medoids} clustering algorithm},
  journal = {Journal of Heuristics},
  year = {2006},
  volume = {12},
  pages = {447--466},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Xu2005,
  author = {Xu, R. and Wunsch, D.},
  title = {Survey of clustering algorithms},
  journal = {{IEEE} Transactions on Neural Networks},
  year = {2005},
  volume = {16},
  pages = {645--678},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% Chiang2004: conference city spelling corrected (Kitakyushu).
@INPROCEEDINGS{Chiang2004,
  author = {Chiang, C. S. and Chu, S. C. and Hsin, Y. C. and Wang, M. H.},
  title = {Genetic distance measure for {K-modes} algorithm},
  booktitle = {International Workshop on Fuzzy Systems and Innovational Computing},
  year = {2004},
  pages = {33--40},
  address = {Kitakyushu, Japan},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% Cerioli2006: conference acronym corrected (COMPSTAT).
@INPROCEEDINGS{Cerioli2006,
  author = {Cerioli, A. and Riani, M. and Atkinson, A. C.},
  title = {Robust classification with categorical variables},
  booktitle = {17th Symposium on Computational Statistics (COMPSTAT 2006)},
  year = {2006},
  editor = {Rizzi, A. and Vichi, M.},
  pages = {507--519},
  address = {Rome, Italy},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Vinod1969,
  author = {Vinod, H.
D.},
  title = {Integer programming and the theory of grouping},
  journal = {Journal of the American Statistical Association},
  year = {1969},
  volume = {64},
  pages = {506--519},
  number = {326},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% All-caps titles exported by the citation database are stored in sentence case below,
% with acronyms brace-protected.
@ARTICLE{Massart1983,
  author = {Massart, D. L. and Plastria, F. and Kaufman, L.},
  title = {Non-hierarchical clustering with {MASLOC}},
  journal = {Pattern Recognition},
  year = {1983},
  volume = {16},
  pages = {507--516},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Abaya1984,
  author = {Abaya, E. F. and Wise, G. L.},
  title = {Convergence of vector quantizers with applications to optimal quantization},
  journal = {{SIAM} Journal on Applied Mathematics},
  year = {1984},
  volume = {44},
  pages = {183--189},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Chaturvedi2001,
  author = {Chaturvedi, A. and Green, P. E. and Carroll, J. D.},
  title = {{K-modes} clustering},
  journal = {Journal of Classification},
  year = {2001},
  volume = {18},
  pages = {35--55},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Huang2003,
  author = {Huang, Z. X. and Ng, M. K.},
  title = {A note on {$K$}-modes clustering},
  journal = {Journal of Classification},
  year = {2003},
  volume = {20},
  pages = {257--261},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@INPROCEEDINGS{Couto2005,
  author = {Couto, J.},
  title = {Kernel {K-means} for categorical data},
  booktitle = {6th International Symposium on Intelligent Data Analysis},
  year = {2005},
  editor = {Famili, A. F. and Kok, J. N. and Pe{\~n}a, J. M. and Siebes, A. and Feelders, A.},
  pages = {46--56},
  address = {Madrid, Spain},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

@ARTICLE{Estivill-Castro2004,
  author = {Estivill-Castro, V.
and Yang, J.},
  title = {Fast and robust general purpose clustering algorithms},
  journal = {Data Mining and Knowledge Discovery},
  year = {2004},
  volume = {8},
  pages = {127--150},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.03}
}

% Bock2007: braces now protect only the math term, not single letters inside words.
% NOTE(review): INCOLLECTION normally carries an editor field; none is recorded here.
@INCOLLECTION{Bock2007,
  author = {Bock, Hans-Hermann},
  title = {A History of {$k$}-Means Algorithms},
  booktitle = {Selected Contributions in Data Analysis and Classification},
  publisher = {Springer Berlin Heidelberg},
  year = {2007},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {161--172},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

@BOOK{Xu2009,
  title = {Clustering},
  publisher = {IEEE Press},
  year = {2009},
  author = {Xu, Rui and Wunsch, Donald C.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

% Spath1980: accent written as a BibTeX special character so the name sorts correctly;
% library-catalogue publisher/address strings reduced to the primary publisher and city.
@BOOK{Spath1980,
  title = {Cluster analysis algorithms for data reduction and classification of objects},
  publisher = {Ellis Horwood},
  year = {1980},
  author = {Sp{\"a}th, Helmuth},
  series = {Computers and their applications},
  address = {Chichester},
  keywords = {Cluster analysis Data processing. FORTRAN (Computer program language)},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

@BOOK{Kogan2007,
  title = {Introduction to clustering large and high-dimensional data},
  publisher = {Cambridge University Press},
  year = {2007},
  author = {Kogan, Jacob},
  address = {Cambridge},
  keywords = {Cluster analysis Data processing. Cluster analysis Computer programs. Computer algorithms. Dimensional analysis Data processing. Dimensional analysis Computer programs.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

@BOOK{Gan2007,
  title = {Data clustering: Theory, algorithms, and applications},
  publisher = {Society for Industrial and Applied Mathematics and American Statistical Association},
  year = {2007},
  author = {Gan, Guojun and Ma, Chaoqun and Wu, Jianhong},
  address = {Philadelphia, PA and Alexandria, VA},
  keywords = {Cluster analysis.
Cluster analysis Data processing.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

@BOOK{Anderberg1973,
  title = {Cluster analysis for applications},
  publisher = {Academic Press},
  year = {1973},
  author = {Anderberg, Michael R.},
  address = {New York},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

% Spath1985: accent written as a BibTeX special character; catalogue-style
% publisher/address strings reduced to the primary publisher and city.
@BOOK{Spath1985,
  title = {Cluster dissection and analysis: Theory, {FORTRAN} programs, examples},
  publisher = {Ellis Horwood},
  year = {1985},
  author = {Sp{\"a}th, Helmuth},
  address = {Chichester},
  keywords = {Cluster analysis Data processing.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

@INPROCEEDINGS{Ordonez2003,
  author = {Ordonez, Carlos},
  title = {Clustering binary data streams with {K-means}},
  booktitle = {DMKD '03: Proceedings of the 8th ACM SIGMOD workshop on Research issues in data mining and knowledge discovery},
  year = {2003},
  pages = {12--19},
  address = {New York, NY, USA},
  publisher = {ACM},
  location = {San Diego, California},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.04}
}

% Janowitz entries: all-caps export titles stored in sentence case.
@ARTICLE{Janowitz1979,
  author = {Janowitz, M. F.},
  title = {Monotone equivariant cluster methods},
  journal = {{SIAM} Journal on Applied Mathematics},
  year = {1979},
  volume = {37},
  pages = {148--165},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.06}
}

@ARTICLE{Janowitz1979a,
  author = {Janowitz, M. F.},
  title = {Preservation of global order equivalence},
  journal = {Journal of Mathematical Psychology},
  year = {1979},
  volume = {20},
  pages = {78--88},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.06}
}

@ARTICLE{Janowitz1993,
  author = {Janowitz, M. F. and Stinebrickner, R.},
  title = {Preservation of weak order equivalence},
  journal = {Mathematical Social Sciences},
  year = {1993},
  volume = {25},
  pages = {181--197},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.06}
}

@ARTICLE{Emrich1991,
  author = {Emrich, L. J. and Piedmonte, M.
R.},
  title = {A method for generating high-dimensional multivariate binary variates},
  journal = {The American Statistician},
  year = {1991},
  volume = {45},
  pages = {302--304},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.06}
}

% Kaufman1987: the book title had a garbled subscript ("L b1 s-norm"); restored as L_1-norm.
% The chapter field only repeated the title (INCOLLECTION expects a chapter number there),
% so it is dropped.
@INCOLLECTION{Kaufman1987,
  author = {Kaufman, Leonard and Rousseeuw, Peter J.},
  title = {Clustering by means of medoids},
  booktitle = {Statistical data analysis based on the {$L_1$}-norm and related methods},
  publisher = {North-Holland},
  year = {1987},
  editor = {Dodge, Yadolah},
  pages = {405--416},
  address = {Amsterdam},
  keywords = {Least absolute deviations (Statistics) Congresses. Mathematical statistics Congresses.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

% NOTE(review): Kaufman2005 and Kaufman1990 appear to be two printings of the same book;
% confirm which one each citing document needs.
@BOOK{Kaufman2005,
  title = {Finding groups in data: An introduction to cluster analysis},
  publisher = {Wiley},
  year = {2005},
  author = {Kaufman, Leonard and Rousseeuw, Peter J.},
  series = {Wiley series in probability and mathematical statistics},
  address = {Hoboken, NJ},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@BOOK{Kaufman1990,
  title = {Finding groups in data: An introduction to cluster analysis},
  publisher = {Wiley},
  year = {1990},
  author = {Kaufman, Leonard and Rousseeuw, Peter J.},
  series = {Wiley series in probability and mathematical statistics. Applied probability and statistics},
  address = {New York},
  keywords = {Cluster analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@ARTICLE{Ng2002,
  author = {Ng, R. T. and Han, J. W.},
  title = {{CLARANS}: A method for clustering objects for spatial data mining},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  year = {2002},
  volume = {14},
  pages = {1003--1016},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@ARTICLE{VanderLaan2003,
  author = {Van der Laan, M. J.
and Pollard, K. S. and Bryan, J.},
  title = {A new partitioning around medoids algorithm},
  journal = {Journal of Statistical Computation and Simulation},
  year = {2003},
  volume = {73},
  pages = {575--584},
  number = {8},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@ARTICLE{Park2009,
  author = {Park, H. S. and Jun, C. H.},
  title = {A simple and fast algorithm for {K-medoids} clustering},
  journal = {Expert Systems with Applications},
  year = {2009},
  volume = {36},
  pages = {3336--3341},
  number = {2},
  note = {Part 2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@ARTICLE{Selim1984,
  author = {Selim, S. Z. and Ismail, M. A.},
  title = {{$K$}-means-type algorithms: a generalized convergence theorem and characterization of local optimality},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year = {1984},
  volume = {6},
  pages = {81--87},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@ARTICLE{Bobrowski1991,
  author = {Bobrowski, L. and Bezdek, J. C.},
  title = {C-means clustering with the {$L_1$} and {$L_\infty$} norms},
  journal = {{IEEE} Transactions on Systems, Man, and Cybernetics},
  year = {1991},
  volume = {21},
  pages = {545--554},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

@INPROCEEDINGS{Leeser2000,
  author = {Leeser, M. and Theiler, J. and Estlick, M. and Szymanski, J. J.},
  title = {Design tradeoffs in a hardware implementation of the {K-means} clustering algorithm},
  booktitle = {1st IEEE Sensor Array and Multichannel Signal Processing Workshop (SAM 2000)},
  year = {2000},
  pages = {520--524},
  address = {Cambridge, MA},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.07}
}

% Spath1976: export artefacts removed from the title ("ALGORITHM .30. L-1 CLUSTER-ANALYSIS"),
% author umlaut restored to match Spath1980/Spath1985, and the citation-database bookkeeping
% moved from the printed note into a non-printing comment field.
@ARTICLE{Spath1976,
  author = {Sp{\"a}th, H.},
  title = {Algorithm 30: {$L_1$} cluster analysis},
  journal = {Computing},
  year = {1976},
  volume = {16},
  pages = {379--387},
  number = {4},
  comment = {ISI Document Delivery No.: BT480 Times Cited: 1 Cited Reference Count: 3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.08}
}

@ARTICLE{Hamming1950,
  author = {Hamming, R. W.},
  title = {Error detecting and error correcting codes},
  journal = {Bell System Technical Journal},
  year = {1950},
  volume = {29},
  pages = {147--160},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.08}
}

% Jancey1966: the database export left an open-ended page range (127-&); the closing page is filled in.
@ARTICLE{Jancey1966,
  author = {Jancey, R. C.},
  title = {Multidimensional group analysis},
  journal = {Australian Journal of Botany},
  year = {1966},
  volume = {14},
  pages = {127--130},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.08}
}

@ARTICLE{Ralambondrainy1995,
  author = {Ralambondrainy, H.},
  title = {A conceptual version of the {K-means} algorithm},
  journal = {Pattern Recognition Letters},
  year = {1995},
  volume = {16},
  pages = {1147--1157},
  number = {11},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.08}
}

@ARTICLE{Davies1979,
  author = {Davies, D. L. and Bouldin, D. W.},
  title = {A cluster separation measure},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year = {1979},
  volume = {1},
  pages = {224--227},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

% NOTE(review): this title and page range look like the Journal of Intelligent Information
% Systems 17(2-3) article rather than a proceedings paper; confirm the venue before citing.
@INPROCEEDINGS{Halkidi2001,
  author = {Halkidi, M. and Batistakis, Y. and Vazirgiannis, M.},
  title = {On clustering validation techniques},
  booktitle = {13th International Conference on Scientific and Statistical Database Management (SSDBM 2001)},
  year = {2001},
  pages = {107--145},
  address = {Fairfax, Virginia},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@ARTICLE{Halkidi2002,
  author = {Halkidi, M. and Batistakis, Y.
and Vazirgiannis, M.},
  title = {Cluster validity methods: Part {I}},
  journal = {{SIGMOD} Record},
  year = {2002},
  volume = {31},
  pages = {40--45},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@ARTICLE{Ratkowsky1978,
  author = {Ratkowsky, D. A. and Lance, G. N.},
  title = {A criterion for determining the number of groups in a classification},
  journal = {Australian Computer Journal},
  year = {1978},
  volume = {10},
  pages = {115--117},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@ARTICLE{Hill1980,
  author = {Hill, R. S.},
  title = {A Stopping Rule for Partitioning Dendrograms},
  journal = {Botanical Gazette},
  year = {1980},
  volume = {141},
  pages = {321--324},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@INPROCEEDINGS{Xu1997,
  author = {Xu, L.},
  title = {{Bayesian} {Ying-Yang} machine, clustering and number of clusters},
  booktitle = {Pattern Recognition in Practice V Conference},
  year = {1997},
  pages = {1167--1178},
  address = {Vlieland, Netherlands},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@ARTICLE{Carmichael1968,
  author = {Carmichael, J. W. and George, J. Alan and Julius, R. S.},
  title = {Finding Natural Clusters},
  journal = {Systematic Zoology},
  year = {1968},
  volume = {17},
  pages = {144--150},
  number = {2},
  abstract = {On being presented with a set of points plotted on a plane space, most observers will agree that the set contains clusters of points if the distribution of the points meets certain conditions. We postulate that these conditions are: (1) that there are continuous, relatively densely populated regions of the space, and (2) that these are surrounded by continuous, relatively empty regions of the space. Using this definition for natural clusters we developed an algorithm for finding such clusters from the relative interpoint distances within a set. 
Starting with the closest pair of points, additional cluster members are sought by a minimum stepping stone strategy and accepted or rejected on the basis of parameters which indicate a discontinuity of closeness (i.e. a continuous, relatively empty space surrounding the points already admitted to the cluster). The procedure is repeated at a number of levels of resolution to give an overall view of the population. The resolution levels appropriate for a given set of points are determined from an analysis of the interpoint distances. Since the procedure uses only the relative distances between the points, it can be applied to points specified in any number of dimensions.},
  month = jun,
  jstor_formatteddate = {Jun., 1968},
  owner = {rebeccaspeckman},
  publisher = {Taylor \& Francis, Ltd. for the Society of Systematic Biologists},
  timestamp = {2009.03.13}
}

@ARTICLE{Carmichael1969,
  author = {Carmichael, J. W. and Sneath, P. H. A.},
  title = {Taxometric Maps},
  journal = {Systematic Zoology},
  year = {1969},
  volume = {18},
  pages = {402--415},
  number = {4},
  abstract = {An $n \times t$ table, recording the results of n different tests or observations on each of t items, contains information which may be used to classify the items (OTU's) according to their similarities and differences. We equate similarity with relative closeness, and difference with relative distance. Relative closeness and relative distance may be thought of as complementary measures of proximity. The process of converting an $n \times t$ table of test results into a $t \times t$ table of proximities between the pairs of items is called proximity analysis. The process of finding any isolated subsets of similar items is called cluster analysis. Taxometric methods are viewed as approaches to compressing the information in an $n \times t$ table into a graphic, best-fit model for the display of proximity and cluster relations among OTU's. Three general approaches are followed. 
1) compression before proximity analysis, 2) compression as an intermediate step between proximity and cluster analysis, and 3) compression after cluster analysis. The utility of these approaches is compared. Taxometric maps are introduced as a means to display proximity and cluster relations. A procedure is given for preparing taxometric maps from cluster analysis results, and two examples are presented for illustration. An appendix on similarity and proximity equates three independently derived similarity coefficients, which are shown to be derivable from a city-block metric proximity measure.},
  month = dec,
  jstor_formatteddate = {Dec., 1969},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.13}
}

@ARTICLE{Jain1999,
  author = {Jain, A. K. and Murty, M. N. and Flynn, P. J.},
  title = {Data clustering: A review},
  journal = {{ACM} Computing Surveys},
  year = {1999},
  volume = {31},
  pages = {264--323},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Grabmeier2002,
  author = {Grabmeier, J. and Rudolph, A.},
  title = {Techniques of cluster algorithms in data mining},
  journal = {Data Mining and Knowledge Discovery},
  year = {2002},
  volume = {6},
  pages = {303--360},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Gyllenberg1997,
  author = {Gyllenberg, M. and Koski, T. and Verlaan, M.},
  title = {Classification of binary vectors by stochastic complexity},
  journal = {Journal of Multivariate Analysis},
  year = {1997},
  volume = {63},
  pages = {47--72},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Lee1997,
  author = {Lee, A. J.},
  title = {Some simple methods for generating correlated categorical variates},
  journal = {Computational Statistics \& Data Analysis},
  year = {1997},
  volume = {26},
  pages = {133--148},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Franti2000,
  author = {Franti, P. and Gyllenberg, H. G. and Gyllenberg, M. and Kivijarvi, J. and Koski, T. and Lund, T. 
and Nevalainen, O.},
  title = {Minimizing stochastic complexity using local search and {GLA} with applications to classification of bacteria},
  journal = {Biosystems},
  year = {2000},
  volume = {57},
  pages = {37--48},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Kang2001,
  author = {Kang, S. H. and Jung, S. H.},
  title = {Generating correlated binary variables with complete specification of the joint distribution},
  journal = {Biometrical Journal},
  year = {2001},
  volume = {43},
  pages = {263--269},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Al-Osh2001,
  author = {Al-Osh, M. A. and Lee, S. J.},
  title = {A simple approach for generating correlated binary variates},
  journal = {Journal of Statistical Computation and Simulation},
  year = {2001},
  volume = {70},
  pages = {231--255},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Qaqish2003,
  author = {Qaqish, B. F.},
  title = {A family of multivariate binary distributions for simulating correlated binary variables with specified marginal means and correlations},
  journal = {Biometrika},
  year = {2003},
  volume = {90},
  pages = {455--463},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Demirtas2007,
  author = {Demirtas, H.},
  title = {The design of simulation studies in medical statistics},
  journal = {Statistics in Medicine},
  year = {2007},
  volume = {26},
  pages = {3818--3821},
  number = {20},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Demirtas2006,
  author = {Demirtas, H.},
  title = {A method for multivariate ordinal data generation given marginal distributions and correlations},
  journal = {Journal of Statistical Computation and Simulation},
  year = {2006},
  volume = {76},
  pages = {1017--1025},
  number = {11},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Farrell2008,
  author = {Farrell, P. J. 
and Rogers-Stewart, K.},
  title = {Methods for generating longitudinally correlated binary data},
  journal = {International Statistical Review},
  year = {2008},
  volume = {76},
  pages = {28--38},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Lee1993,
  author = {Lee, A. J.},
  title = {Generating Random Binary Deviates Having Fixed Marginal Distributions and Specified Degrees of Association},
  journal = {American Statistician},
  year = {1993},
  volume = {47},
  pages = {209--215},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Goodman1974a,
  author = {Goodman, L. A.},
  title = {Exploratory Latent Structure-Analysis Using Both Identifiable and Unidentifiable Models},
  journal = {Biometrika},
  year = {1974},
  volume = {61},
  pages = {215--231},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@INPROCEEDINGS{Birkelund1995,
  author = {Birkelund, G. E. and Goodman, L. A. and Rose, D.},
  title = {The latent structure of job characteristics of men and women},
  booktitle = {Biannual Meeting of the Research-Committee-on-Social-Stratification, International-Sociological-Association},
  year = {1995},
  pages = {80--113},
  address = {Zurich, Switzerland},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@INPROCEEDINGS{Burton2005,
  author = {Burton, A. and Altman, D. G. and Royston, P. and Holder, R. L.},
  title = {The design of simulation studies in medical statistics},
  booktitle = {26th Annual Meeting of the International-Society-for-Clinical-Biostatistics (ISCB26)},
  year = {2005},
  pages = {4279--4292},
  address = {Szeged, Hungary},
  abstract = {Simulation studies use computer intensive procedures to assess the performance of a variety of statistical methods in relation to a known truth. Such evaluation cannot be achieved with studies of real data alone. Designing high-quality simulations that reflect the complex situations seen in practice, such as in prognostic factors studies, is not a simple process. 
Unfortunately, very few published simulation studies provide sufficient details to allow readers to understand fully all the processes required to design a simulation study. When planning a simulation study, it is recommended that a detailed protocol be produced, giving full details of how the study will be performed, analysed and reported. This paper details the important considerations necessary when designing any simulation study, including defining specific objectives of the study, determining the procedures for generating the data sets and the number of simulations to perform. A checklist highlighting the important considerations when designing a simulation study is provided. A small review of the literature identifies the current practices within published simulation studies. Copyright (c) 2006 John Wiley \& Sons, Ltd.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.16}
}

@ARTICLE{Rousseeuw1987,
  author = {Rousseeuw, P. J.},
  title = {Silhouettes - A Graphical Aid to the Interpretation and Validation of Cluster-Analysis},
  journal = {Journal of Computational and Applied Mathematics},
  year = {1987},
  volume = {20},
  pages = {53--65},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@INPROCEEDINGS{Dudoit2001,
  author = {Dudoit, S. and Fridlyand, J.},
  title = {Bagging to improve the accuracy of a clustering procedure},
  booktitle = {NIPS Workshop on Machine Learning Techniques for Bioinformatics},
  year = {2001},
  pages = {1090--1099},
  address = {Whistler, Canada},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Gower1971,
  author = {Gower, J. C.},
  title = {General Coefficient of Similarity and Some of Its Properties},
  journal = {Biometrics},
  year = {1971},
  volume = {27},
  pages = {857+},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Gower1986,
  author = {Gower, J. C. 
and Legendre, P.},
  title = {Metric and {Euclidean} Properties of Dissimilarity Coefficients},
  journal = {Journal of Classification},
  year = {1986},
  volume = {3},
  pages = {5--48},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@INPROCEEDINGS{BenHur2002,
  author = {Ben-Hur},
  title = {A stability based method for discovering structure in clustered dataset},
  year = {2002},
  internal-note = {review: booktitle is missing (required for @inproceedings) and the author list looks incomplete; TODO complete from the original source},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@TECHREPORT{Fridlyand2001,
  author = {Fridlyand, J. and Dudoit, S.},
  title = {Applications of resampling methods to estimate the number of clusters and to improve the accuracy of a clustering method},
  year = {2001},
  internal-note = {review: institution is missing (required for @techreport); TODO complete from the original source},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Breckenridge1989,
  author = {Breckenridge, J. N.},
  title = {Replicating Cluster-Analysis - Method, Consistency, and Validity},
  journal = {Multivariate Behavioral Research},
  year = {1989},
  volume = {24},
  pages = {147--161},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Breckenridge2000,
  author = {Breckenridge, J. N.},
  title = {Validating cluster analysis: Consistent replication and symmetry},
  journal = {Multivariate Behavioral Research},
  year = {2000},
  volume = {35},
  pages = {261--285},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Bittner2000,
  author = {Bittner, M. and Meitzer, P. and Chen, Y. and Jiang, Y. and Seftor, E. and Hendrix, M. and Radmacher, M. and Simon, R. and Yakhini, Z. and Ben-Dor, A. and Sampas, N. and Dougherty, E. and Wang, E. and Marincola, F. and Gooden, C. and Lueders, J. and Glatfelter, A. and Pollock, P. and Carpten, J. and Gillanders, E. and Leja, D. and Dietrich, K. and Beaudry, C. and Berens, M. and Alberts, D. and Sondak, V. and Hayward, N. 
and Trent, J.},
  title = {Molecular classification of cutaneous malignant melanoma by gene expression profiling},
  journal = {Nature},
  year = {2000},
  volume = {406},
  pages = {536--540},
  number = {6795},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Kerr2001,
  author = {Kerr, M. K. and Churchill, G. A.},
  title = {Bootstrapping cluster analysis: Assessing the reliability of conclusions from microarray experiments},
  journal = {Proceedings of the National Academy of Sciences of the United States of America},
  year = {2001},
  volume = {98},
  pages = {8961--8965},
  number = {16},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Levine2001,
  author = {Levine, E. and Domany, E.},
  title = {Resampling method for unsupervised estimation of cluster validity},
  journal = {Neural Computation},
  year = {2001},
  volume = {13},
  pages = {2573--2593},
  number = {11},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Kell2004,
  author = {Kell, D. B. and Oliver, S. G.},
  title = {Here is the evidence, now what is the hypothesis? The complementary roles of inductive and hypothesis-driven science in the post-genomic era},
  journal = {BioEssays},
  year = {2004},
  volume = {26},
  pages = {99--105},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ARTICLE{Lange2004,
  author = {Lange, T. and Roth, V. and Braun, M. L. and Buhmann, J. M.},
  title = {Stability-based validation of clustering solutions},
  journal = {Neural Computation},
  year = {2004},
  volume = {16},
  pages = {1299--1323},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.17}
}

@ELECTRONIC{UCIdatabase,
  author = {Hettich, S. and Bay, S. D.},
  year = {1999},
  title = {The {UCI} {KDD} Archive},
  organization = {Irvine, CA: University of California, Department of Information and Computer Science.},
  url = {http://kdd.ics.uci.edu/},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.20}
}

@ELECTRONIC{UCI-ML,
  author = {A. Asuncion and D. J. 
Newman},
  year = {2007},
  title = {{UCI} Machine Learning Repository},
  organization = {University of California, School of Information and Computer Science},
  url = {http://www.ics.uci.edu/~mlearn/MLRepository.html},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.20}
}

@ARTICLE{Hodson1970,
  author = {Hodson, F. R.},
  title = {Cluster Analysis and Archaeology - Some New Developments and Applications},
  journal = {World Archaeology},
  year = {1970},
  volume = {1},
  pages = {299--320},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Blashfield1979,
  author = {Blashfield, R. K. and Morey, L. C.},
  title = {Classification of Depression Through Cluster-Analysis},
  journal = {Comprehensive Psychiatry},
  year = {1979},
  volume = {20},
  pages = {516--527},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Everitt2004,
  author = {Everitt, B.},
  title = {Untitled},
  journal = {Statistical Methods in Medical Research},
  year = {2004},
  volume = {13},
  pages = {343--345},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Gallegos2005,
  author = {Gallegos, M. T. and Ritter, G.},
  title = {A robust method for cluster analysis},
  journal = {Annals of Statistics},
  year = {2005},
  volume = {33},
  pages = {347--380},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@INPROCEEDINGS{Shim2005,
  author = {Shim, Y. and Chung, J. and Choi, I. C.},
  title = {A comparison study of cluster validity indices using a nonhierarchical clustering algorithm},
  booktitle = {International Conference on Computational Intelligence for Modelling, Control and Automation/International Conference on Intelligent Agents Web Technologies and International Commerce},
  year = {2005},
  editor = {Mohammadian, M.},
  pages = {199--203},
  address = {Vienna, Austria},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Mao1996,
  author = {Mao, J. C. and Jain, A. 
K.},
  title = {A self-organizing network for hyperellipsoidal clustering ({HEC})},
  journal = {{IEEE} Transactions on Neural Networks},
  year = {1996},
  volume = {7},
  pages = {16--29},
  number = {1},
  abstract = {We propose a self-organizing network for hyperellipsoidal clustering (HEC). The HEC network consists of two layers. The first layer employs a number of principal component analysis subnetworks which are used to estimate the hyperellipsoidal shapes of currently formed clusters. The second layer then performs a competitive learning using the cluster shape information provided by the first layer. The HEC network performs a partitional clustering using the proposed regularized Mahalanobis distance. This regularized Mahalanobis distance is designed to deal with the problems in estimating the Mahalanobis distance when the number of patterns in a cluster is less than (ill-posed problem) or not considerably larger than (poorly posed problem) the dimensionality of the feature space during the clustering procedure. This regularized distance also achieves a tradeoff between hyperspherical and hyperellipsoidal cluster shapes so as to prevent the HEC network from producing unusually large or unusually small clusters. The significance level of the Kolmogorov-Smirnov test on the distribution of the Mahalanobis distances of patterns in a cluster to the cluster center under the Gaussian cluster assumption is used as a compactness measure of the cluster. The HEC network has been tested on a number of artificial data sets and real data sets. We also apply the HEC network to texture segmentation problems. Experiments show that the HEC network leads to a significant improvement in the clustering results over the K-means algorithm with Euclidean distance. Our results on real data sets also indicate that hyperellipsoidal shaped clusters are often encountered in practice.},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Mao1997,
  author = {Mao, J. C. 
and Jain, A. K.},
  title = {A self-organizing network for hyperellipsoidal clustering ({HEC}) - Reply},
  journal = {{IEEE} Transactions on Neural Networks},
  year = {1997},
  volume = {8},
  pages = {1563},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.03.28}
}

@ARTICLE{Vlies2009,
  author = {van der Vlies, A. E. and Verwey, N. A. and Bouwman, F. H. and Blankenstein, M. A. and Klein, M. and Scheltens, P. and van der Flier, W. M.},
  title = {{CSF} biomarkers in relationship to cognitive profiles in {Alzheimer} disease},
  journal = {Neurology},
  year = {2009},
  volume = {72},
  pages = {1056--1061},
  number = {12},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Sellick1990,
  author = {Sellick, K. and Littlejohn, G. and Wallace, C. and Over, R.},
  title = {Identifying Subclasses of Patients with Rheumatoid-Arthritis Through Cluster-Analysis},
  journal = {Journal of Rheumatology},
  year = {1990},
  volume = {17},
  pages = {1613--1619},
  number = {12},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Chaturvedi1997,
  author = {Chaturvedi, A. and Carroll, J. D. and Green, P. E. and Rotondo, J. A.},
  title = {A feature-based approach to market segmentation via overlapping {K}-centroids clustering},
  journal = {Journal of Marketing Research},
  year = {1997},
  volume = {34},
  pages = {370--377},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Koo2001,
  author = {Koo, E. and Nagy, Z. and Sesztak, M. and Ujfalussy, I. and Meretey, K. and Bohm, U. and Forgacs, S. and Szilagy, M. and Czirjak, L. and Farkas, V.},
  title = {Subsets in psoriatic arthritis formed by cluster analysis},
  journal = {Clinical Rheumatology},
  year = {2001},
  volume = {20},
  pages = {36--43},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Diallo-Danebrock2007,
  author = {Diallo-Danebrock, R. and Ting, E. and Gluz, O. and Herr, A. and Mohrmann, S. and Geddert, H. and Rody, A. and Schaefer, K. L. and Baldus, S. E. and Hartmann, A. and Wild, P. J. 
and Burson, M. and Gabbert, H. E. and Nitz, U. and Poremba, C.},
  title = {Protein expression profiling in high-risk breast cancer patients treated with high-dose or conventional dose-dense chemotherapy},
  journal = {Clinical Cancer Research},
  year = {2007},
  volume = {13},
  pages = {488--497},
  number = {2},
  note = {Part 1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Brusco2008,
  author = {Brusco, M. J. and Kohn, H. F.},
  title = {Optimal partitioning of a data set based on the p-median model},
  journal = {Psychometrika},
  year = {2008},
  volume = {73},
  pages = {89--105},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Steinley2008b,
  author = {Steinley, D.},
  title = {Stability analysis in {$K$}-means clustering},
  journal = {British Journal of Mathematical \& Statistical Psychology},
  year = {2008},
  volume = {61},
  pages = {255--273},
  note = {Part 2},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Grun2009,
  author = {Grun, B. and Leisch, F.},
  title = {Dealing with label switching in mixture models under genuine multimodality},
  journal = {Journal of Multivariate Analysis},
  year = {2009},
  volume = {100},
  pages = {851--861},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.05}
}

@ARTICLE{Ubeyli,
  author = {{\"U}beyli, Elif and Do{\u{g}}du, Erdo{\u{g}}an},
  title = {Automatic Detection of Erythemato-Squamous Diseases Using {$k$}-Means Clustering},
  journal = {Journal of Medical Systems},
  year = {2009},
  internal-note = {I can't find it in the list view of the journal (by volume, date), but can find by searching. Possibly is up early.},
  abstract = {A new approach based on the implementation of k-means clustering is presented for automated detection of erythemato-squamous diseases. The purpose of clustering techniques is to find a structure for the given data by finding similarities between data according to data characteristics. The studied domain contained records of patients with known diagnosis. 
The k-means clustering algorithm's task was to classify the data points, in this case the patients with attribute data, to one of the five clusters. The algorithm was used to detect the five erythemato-squamous diseases when 33 features defining five disease indications were used. The purpose is to determine an optimum classification scheme for this problem. The present research demonstrated that the features well represent the erythemato-squamous diseases and the k-means clustering algorithm's task achieved high classification accuracies for only five erythemato-squamous diseases.},
  citeulike-article-id = {3619137},
  doi = {10.1007/s10916-008-9229-6},
  owner = {rebeccaspeckman},
  posted-at = {2008-11-20 20:23:48},
  timestamp = {2009.04.09}
}

@ARTICLE{Goodman1954,
  author = {Goodman, L. A. and Kruskal, W. H.},
  title = {Measures of Association for Cross Classifications},
  journal = {Journal of the American Statistical Association},
  year = {1954},
  volume = {49},
  pages = {732--764},
  number = {268},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Mirkin2001,
  author = {Mirkin, B.},
  title = {Eleven ways to look at the chi-squared coefficient for contingency tables},
  journal = {American Statistician},
  year = {2001},
  volume = {55},
  pages = {111--120},
  number = {2},
  abstract = {This article has been written in recognition of the 100th anniversary of introduction of the concept of association between categorical variables by Yule and Pearson. The most popular among the contingency coefficients, Pearson's chi-squared, estimates the bias of a cross-classification from the statistical independence. Also, it measures association per se between the row and column variables. The purpose of this article is to present a collection of 11 definitions for the chi-squared coefficient related to either of these goals. 
One of the quoted definitions of the chi-squared coefficient seems especially appealing as an association measure: the averaged relative Quetelet index of category-to-category associations.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Topchy2005,
  author = {Topchy, A. and Jain, A. K. and Punch, W.},
  title = {Clustering ensembles: Models of consensus and weak partitions},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year = {2005},
  volume = {27},
  pages = {1866--1881},
  number = {12},
  abstract = {Clustering ensembles have emerged as a powerful method for improving both the robustness as well as the stability of unsupervised classification solutions. However, finding a consensus clustering from multiple partitions is a difficult problem that can be approached from graph-based, combinatorial, or statistical perspectives. This study extends previous research on clustering ensembles in several respects. First, we introduce a unified representation for multiple clusterings and formulate the corresponding categorical clustering problem. Second, we propose a probabilistic model of consensus using a finite mixture of multinomial distributions in a space of clusterings. A combined partition is found as a solution to the corresponding maximum-likelihood problem using the EM algorithm. Third, we define a new consensus function that is related to the classical intraclass variance criterion using the generalized mutual information definition. Finally, we demonstrate the efficacy of combining partitions generated by weak clustering algorithms that use data projections and random data splits. A simple explanatory model is offered for the behavior of combinations of such weak clustering components. Combination accuracy is analyzed as a function of several parameters that control the power and resolution of component partitions as well as the number of partitions. 
We also analyze clustering ensembles with incomplete information and the effect of missing cluster labels on the quality of overall consensus. Experimental results demonstrate the effectiveness of the proposed methods on several real-world data sets.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Albatineh2006,
  author = {Albatineh, A. N. and Niewiadomska-Bugaj, M. and Mihalko, D.},
  title = {On similarity indices and correction for chance agreement},
  journal = {Journal of Classification},
  year = {2006},
  volume = {23},
  pages = {301--313},
  number = {2},
  abstract = {Similarity indices can be used to compare partitions (clusterings) of a data set. Many such indices were introduced in the literature over the years. We are showing that out of 28 indices we were able to track, there are 22 different ones. Even though their values differ for the same clusterings compared, after correcting for agreement attributed to chance only, their values become similar and some of them even become equivalent. Consequently, the problem of choice of the index to be used for comparing different clusterings becomes less important.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Warrens2008,
  author = {Warrens, M. J.},
  title = {On the indeterminacy of resemblance measures for binary (presence/absence) data},
  journal = {Journal of Classification},
  year = {2008},
  volume = {25},
  pages = {125--136},
  number = {1},
  abstract = {Many similarity coefficients for binary data are defined as fractions. For certain resemblance measures the denominator may become zero. If the denominator is zero the value of the coefficient is indeterminate. It is shown that the seriousness of the indeterminacy problem differs with the resemblance measures. Following Batagelj and Bren (1995) we remove the indeterminacies by defining appropriate values in critical cases.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Warrens2008a,
  author = {Warrens, M. 
J.},
  title = {On similarity coefficients for 2 x 2 tables and correction for chance},
  journal = {Psychometrika},
  year = {2008},
  volume = {73},
  pages = {487--502},
  number = {3},
  abstract = {This paper studies correction for chance in coefficients that are linear functions of the observed proportion of agreement. The paper unifies and extends various results on correction for chance in the literature. A specific class of coefficients is used to illustrate the results derived in this paper. Coefficients in this class, e.g. the simple matching coefficient and the Dice/Sorenson coefficient, become equivalent after correction for chance, irrespective of what expectation is used. The coefficients become either Cohen's kappa, Scott's pi, Mak's rho, Goodman and Kruskal's lambda, or Hamann's eta, depending on what expectation is considered appropriate. Both a multicategorical generalization and a multivariate generalization are discussed.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Warrens2008b,
  author = {Warrens, M. J.},
  title = {On Association Coefficients for 2x2 Tables and Properties That Do Not Depend on the Marginal Distributions},
  journal = {Psychometrika},
  year = {2008},
  volume = {73},
  pages = {777--789},
  number = {4},
  abstract = {We discuss properties that association coefficients may have in general, e.g., zero value under statistical independence, and we examine coefficients for 2x2 tables with respect to these properties. Furthermore, we study a family of coefficients that are linear transformations of the observed proportion of agreement given the marginal probabilities. This family includes the phi coefficient and Cohen's kappa. The main result is that the linear transformations that set the value under independence at zero and the maximum value at unity, transform all coefficients in this family into the same underlying coefficient. 
This coefficient happens to be Loevinger's H.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Warrens2008c,
  author = {Warrens, M. J.},
  title = {Bounds of Resemblance Measures for Binary (Presence/Absence) Variables},
  journal = {Journal of Classification},
  year = {2008},
  volume = {25},
  pages = {195--208},
  number = {2},
  abstract = {Bounds of association coefficients for binary variables are derived using the arithmetic-geometric-harmonic mean inequality. More precisely, it is shown which presence/absence coefficients are bounds with respect to each other. Using the new bounds it is investigated whether a coefficient is in general closer to either its upper or its lower bound.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Wu2009,
  author = {Wu, J. J. and Chen, J. and Xiong, H. and Xie, M.},
  title = {External validation measures for {K}-means clustering: A data distribution perspective},
  journal = {Expert Systems with Applications},
  year = {2009},
  volume = {36},
  pages = {6050--6061},
  number = {3},
  note = {Part 2},
  abstract = {Cluster validation is an important part of any cluster analysis. External measures such as entropy, purity and mutual information are often used to evaluate K-means clustering. However, whether these measures are indeed suitable for K-means clustering remains unknown. Along this line, in this paper, we show that a data distribution view is of great use to selecting the right measures for K-means clustering. Specifically, we first introduce the data distribution view of K-means, and the resultant uniform effect on highly imbalanced data sets. Eight external measures widely used in recent data mining tasks are also collected as candidates for K-means evaluation. Then, we demonstrate that only three measures, namely the variation of information (VI), the van Dongen criterion (VD) and the Mirkin metric (M), can detect the negative uniform effect of K-means in the clustering results. 
We also provide new normalization schemes for these three measures, i.e., VI'(norm), VD'(norm) and M'(norm), which enables the cross-data comparisons of clustering qualities. Finally, we explore some properties such as the consistency and sensitivity of the three measures, and give some advice on how to use them in K-means practice. (C) 2008 Elsevier Ltd. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Entries below come from a Web of Science export. "Times Cited" counts are
% kept under annote (ignored by the standard styles) instead of note, which
% would be printed in the reference list. Page ranges use "--".

@ARTICLE{Makuch1989,
  author = {Makuch, R. W. and Rosenberg, P. S. and Scott, G.},
  title = {GOODMAN AND KRUSKAL LAMBDA - A NEW LOOK AT AN OLD MEASURE OF ASSOCIATION},
  journal = {Statistics in Medicine},
  year = {1989},
  volume = {8},
  pages = {619--631},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Goodman1959,
  author = {Goodman, L. A. and Kruskal, W. H.},
  title = {MEASURES OF ASSOCIATION FOR CROSS CLASSIFICATIONS .2. FURTHER DISCUSSION AND REFERENCES},
  journal = {Journal of the American Statistical Association},
  year = {1959},
  volume = {54},
  pages = {123--163},
  number = {285},
  annote = {Times Cited: 203},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Goodman1972: the export placeholder "415-\&" (unknown end page) and the
% truncated issue number "33" are replaced by the published values
% (JASA 67(338), pp. 415-421).
@ARTICLE{Goodman1972,
  author = {Goodman, L. A. and Kruskal, W. H.},
  title = {MEASURES OF ASSOCIATION FOR CROSS CLASSIFICATIONS .4. SIMPLIFICATION OF ASYMPTOTIC VARIANCES},
  journal = {Journal of the American Statistical Association},
  year = {1972},
  volume = {67},
  pages = {415--421},
  number = {338},
  annote = {Times Cited: 148},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Gray1975,
  author = {Gray, L. N. and Campbell, R.},
  title = {STATISTICAL SIGNIFICANCE OF LAMBDA COEFFICIENTS - A COMMENT},
  journal = {Behavioral Science},
  year = {1975},
  volume = {20},
  pages = {258--259},
  number = {4},
  annote = {Times Cited: 1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Goodman1963,
  author = {Goodman, L. A. and Kruskal, W. H.},
  title = {MEASURES OF ASSOCIATION FOR CROSS CLASSIFICATIONS .3.
APPROXIMATE SAMPLING THEORY},
  journal = {Journal of the American Statistical Association},
  year = {1963},
  volume = {58},
  pages = {310--364},
  number = {302},
  annote = {Times Cited: 273},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% In the entry above the export placeholder "310-\&" was replaced by the
% published range (JASA 58(302), pp. 310-364). Throughout this group,
% Web of Science "Times Cited" text is stored under annote (ignored by the
% standard styles) so it is not printed, and page ranges use "--".

@ARTICLE{Meila2001,
  author = {Meila, M. and Heckerman, D.},
  title = {An experimental comparison of model-based clustering methods},
  journal = {Machine Learning},
  year = {2001},
  volume = {42},
  pages = {9--29},
  number = {1-2},
  annote = {Times Cited: 37},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Meila2003: this is a paper in a conference proceedings volume, so it is an
% INPROCEEDINGS entry with booktitle/editor rather than an ARTICLE with the
% proceedings title in the journal field. Editors and venue are taken from
% the original export note, which is preserved verbatim under annote.
% NOTE(review): series name assumed from the volume number -- confirm.
@INPROCEEDINGS{Meila2003,
  author = {Meila, M.},
  title = {Comparing clusterings by the variation of information},
  booktitle = {Learning Theory and Kernel Machines},
  year = {2003},
  editor = {Sch{\"o}lkopf, B. and Warmuth, M. K.},
  series = {Lecture Notes in Computer Science},
  volume = {2777},
  pages = {173--187},
  address = {Washington, D.C.},
  annote = {Times Cited: 7 Scholkopf, B Warmuth, MK 16th Annual Conference on Learning Theory/7th Annual Workshop on Kernel Machines AUG 24-27, 2003 WASHINGTON, D.C.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Patrikainen2006: journal acronym restored to IEEE (export had "Ieee").
@ARTICLE{Patrikainen2006,
  author = {Patrikainen, A. and Meila, M.},
  title = {Comparing subspace clusterings},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {2006},
  volume = {18},
  pages = {902--916},
  number = {7},
  annote = {Times Cited: 5},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Wu2007: conference paper, so INPROCEEDINGS with booktitle instead of an
% ARTICLE with the proceedings title as journal. Editors and location come
% from the original export note (kept under annote). The address field holds
% the conference location, as in the other INPROCEEDINGS entries of this file.
@INPROCEEDINGS{Wu2007,
  author = {Wu, J. J. and Xiong, H. and Chen, J. and Zhou, W. J.},
  title = {A generalization of proximity functions for {K}-means},
  booktitle = {{ICDM} 2007: Proceedings of the Seventh {IEEE} International Conference on Data Mining},
  year = {2007},
  editor = {Ramakrishnan, N. and Zaiane, O. R. and Shi, Y. and Clifton, C. W. and Wu, X. D.},
  pages = {361--370},
  address = {Omaha, NE},
  annote = {Times Cited: 0 Ramakrishnan, N Zaiane, OR Shi, Y Clifton, CW Wu, XD 7th IEEE International Conference on Data Mining OCT 28-31, 2007 Omaha, NE},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Meila2007,
  author = {Meila, M.},
  title = {Comparing clusterings - an information based distance},
  journal = {Journal of Multivariate Analysis},
  year = {2007},
  volume = {98},
  pages = {873--895},
  number = {5},
  annote = {Times Cited: 11},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Suich2003: surnames in the title are brace-protected against down-casing.
@ARTICLE{Suich2003,
  author = {Suich, R. C. and Turek, R. J.},
  title = {An asymptotic partial correlation test for the {Goodman}-{Kruskal} lambda},
  journal = {British Journal of Mathematical \& Statistical Psychology},
  year = {2003},
  volume = {56},
  pages = {111--117},
  note = {Part 1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Edwardes2000,
  author = {Edwardes, M. D. D. and Baltzan, M.},
  title = {The generalization of the odds ratio, risk ratio and risk difference to r x k tables},
  journal = {Statistics in Medicine},
  year = {2000},
  volume = {19},
  pages = {1901--1914},
  number = {14},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

@ARTICLE{Sung1998,
  author = {Sung, C. S. and Ahn, S. J.},
  title = {A proportional-reduction-in-impurity measure of association for categorical variables},
  journal = {Communications in Statistics-Theory and Methods},
  year = {1998},
  volume = {27},
  pages = {2083--2110},
  number = {8},
  annote = {Times Cited: 0},
  abstract = {This paper presents a proportional-reduction-in-impurity (PRI) measure for categorical association, that employs application-dependent loss functions which make the measure widely applicable. The well-known proportional-reduction-in-error (PRE) measure is shown to be a special case of the new PRI measure.
Moreover, the asymptotic variance of the maximum likelihood estimator (MLE) of the measure is derived to facilitate its use for statistical inference. An extension of the PRI measure to compositional association is made to show that it can have a variety of applications. Selected loss functions are treated to illustrate the derivation of the measure.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.13}
}

% Page ranges in this group use "--" (en dash).

% Higham2002: journal acronym restored to IMA (export had "Ima").
@ARTICLE{Higham2002,
  author = {Higham, N. J.},
  title = {Computing the nearest correlation matrix - a problem from finance},
  journal = {IMA Journal of Numerical Analysis},
  year = {2002},
  volume = {22},
  pages = {329--343},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.14}
}

@ARTICLE{Grubisic2007,
  author = {Grubisic, I. and Pietersz, R.},
  title = {Efficient rank reduction of correlation matrices},
  journal = {Linear Algebra and Its Applications},
  year = {2007},
  volume = {422},
  pages = {629--653},
  number = {2-3},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.14}
}

@ARTICLE{Gerbing1987,
  author = {Gerbing, D. W. and Anderson, J. C.},
  title = {IMPROPER SOLUTIONS IN THE ANALYSIS OF COVARIANCE-STRUCTURES - THEIR INTERPRETABILITY AND A COMPARISON OF ALTERNATE RESPECIFICATIONS},
  journal = {Psychometrika},
  year = {1987},
  volume = {52},
  pages = {99--111},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.14}
}

@ARTICLE{Anderson1988,
  author = {Anderson, J. C. and Gerbing, D. W.},
  title = {STRUCTURAL EQUATION MODELING IN PRACTICE - A REVIEW AND RECOMMENDED 2-STEP APPROACH},
  journal = {Psychological Bulletin},
  year = {1988},
  volume = {103},
  pages = {411--423},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.14}
}

% Jollois: the author list separated names with commas, which BibTeX parses
% as one malformed "Last, Jr, First" name plus a second author; the three
% surnames are now joined with " and " so each is a separate author.
% NOTE(review): title = {Book} is a placeholder and publisher/year are
% missing, so this entry still triggers missing-field warnings. Presumably a
% chapter in an edited volume -- TODO supply the real book title, editors,
% publisher, year and pages, and author initials.
@INBOOK{Jollois,
  chapter = {Assessing the number of clusters in the latent class model},
  title = {Book},
  author = {Jollois and Nadif and Govaert},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.14}
}

@INPROCEEDINGS{Gan2005,
  author = {Gan, G. J. and Yang, Z. J. and Wu, J.
H.},
  title = {A genetic k-modes algorithm for clustering categorical data},
  booktitle = {1st International Conference on Advanced Data Mining and Applications},
  year = {2005},
  editor = {Li, X. and Wang, S. and Dong, Z. Y.},
  pages = {195--202},
  address = {Wuhan, People's Republic of China},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% In the entry above the export spelling "PEOPLES R CHINA" was written out.
% Page ranges in this group use "--".

@INCOLLECTION{Luo2006,
  author = {Luo, H. L. and Kong, F. S. and Li, Y. X.},
  title = {Combining multiple clusterings via k-modes algorithm},
  booktitle = {Advanced Data Mining and Applications, Proceedings},
  year = {2006},
  volume = {4093},
  series = {Lecture Notes in Artificial Intelligence},
  pages = {308--315},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% Bollen1993: Bollen and Long are the editors of this volume (stated in the
% catalogue text and in Wothke1993), so they are recorded under editor, not
% author. The library-catalogue text is kept under annote (ignored by the
% standard styles) so it is not printed; series and volume are taken from it.
@BOOK{Bollen1993,
  editor = {Bollen, Kenneth A. and Long, J. Scott},
  title = {Testing structural equation models},
  publisher = {Sage Publications},
  year = {1993},
  address = {Newbury Park},
  series = {Sage Focus Editions},
  volume = {154},
  annote = {92037324 Kenneth A. Bollen, J. Scott Long, editors. ill. ; 23 cm. Sage focus editions ; v. 154 Includes bibliographical references.},
  keywords = {Social sciences Mathematical models. Social sciences Methodology.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% Jain1988: library-catalogue text kept under annote so it is not printed.
@BOOK{Jain1988,
  author = {Jain, Anil K. and Dubes, Richard C.},
  title = {Algorithms for clustering data},
  publisher = {Prentice Hall},
  year = {1988},
  address = {Englewood Cliffs, N.J.},
  series = {Prentice Hall Advanced Reference Series},
  annote = {87024500 Anil K. Jain, Richard C. Dubes. ill. ; 24 cm. Prentice Hall advanced reference series Bibliography: p. 275-296. Includes indexes.},
  keywords = {Cluster analysis Data processing. Computer algorithms.},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% Wothke1993: a titled chapter by one author inside a volume edited by
% others is an INCOLLECTION (title = chapter title, booktitle = book title).
% The previous INBOOK form put the chapter title in the chapter field and
% set both author and editor, which the standard styles warn about.
@INCOLLECTION{Wothke1993,
  author = {Wothke, Werner},
  title = {Nonpositive definite matrices in structural modeling},
  booktitle = {Testing structural equation models},
  editor = {Bollen, Kenneth A. and Long, J. Scott},
  publisher = {Sage Publications},
  address = {Newbury Park},
  year = {1993},
  pages = {256--293},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

@ARTICLE{Joreskog1967,
  author = {Joreskog, K.
G.},
  title = {SOME CONTRIBUTIONS TO MAXIMUM LIKELIHOOD FACTOR ANALYSIS},
  journal = {Psychometrika},
  year = {1967},
  volume = {32},
  pages = {443--482},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% In the entry above the export placeholder "443-\&" (unknown end page) was
% replaced by the published range (Psychometrika 32(4), pp. 443-482).
% Page ranges in this group use "--".

@ARTICLE{Lawley1973,
  author = {Lawley, D. N. and Maxwell, A. E.},
  title = {REGRESSION AND FACTOR-ANALYSIS},
  journal = {Biometrika},
  year = {1973},
  volume = {60},
  pages = {331--338},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% Qi2006: journal acronym restored to SIAM; the proper noun in the title is
% brace-protected against down-casing.
@ARTICLE{Qi2006,
  author = {Qi, H. D. and Sun, D. F.},
  title = {A quadratically convergent {Newton} method for computing the nearest correlation matrix},
  journal = {SIAM Journal on Matrix Analysis and Applications},
  year = {2006},
  volume = {28},
  pages = {360--385},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

% Munoz2006: export spelling "PEOPLES R CHINA" written out.
@INPROCEEDINGS{Munoz2006,
  author = {Munoz, A. and de Diego, I. M.},
  title = {From indefinite to positive semi-definite matrices},
  booktitle = {Joint International Workshop on Structural, Syntactic, and Statistical Pattern Recognition},
  year = {2006},
  editor = {Yeung, D. Y. and Kwok, J. T. and Fred, A. and Roli, F. and DeRidder, D.},
  pages = {764--772},
  address = {Hong Kong, People's Republic of China},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

@ARTICLE{Qi2007,
  author = {Qi, H. D. and Xia, Z. G. and Xing, G. M.},
  title = {An application of the nearest correlation matrix on web document classification},
  journal = {Journal of Industrial and Management Optimization},
  year = {2007},
  volume = {3},
  pages = {701--713},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.04.15}
}

@UNPUBLISHED{Yeung2001b,
  author = {Ka Yee Yeung and Walter L.
Ruzzo},
  title = {Details of the adjusted {Rand} index and clustering algorithms},
  year = {2001},
  note = {Unpublished supplement to Bioinformatics paper}
}

% In the UNPUBLISHED entry above, the description was moved from journal
% (not used by that entry type) to note (required by it), and the
% zero-valued placeholder fields issue/pages/volume were dropped.

@ARTICLE{Hubert1977,
  author = {Lawrence Hubert},
  title = {Nominal scale response agreement as a generalized correlation},
  journal = {British Journal of Mathematical and Statistical Psychology},
  year = {1977},
  volume = {30},
  pages = {98--103}
}

% Hubert1979: the issue is stored in number, the field the standard styles
% read (issue is ignored by them).
@ARTICLE{Hubert1979,
  author = {Lawrence Hubert},
  title = {Matching models in the analysis of cross-classifications},
  journal = {Psychometrika},
  year = {1979},
  volume = {44},
  pages = {21--41},
  number = {1}
}

% Tan1982: journal name written in full, abbreviated page range expanded,
% and the percent signs in the abstract escaped so the text cannot start a
% TeX comment if a style prints the abstract.
@ARTICLE{Tan1982,
  author = {Tan, E. M. and Cohen, A. S. and Fries, J. F. and Masi, A. T. and McShane, D. J. and Rothfield, N. F. and Schaller, J. G. and Talal, N. and Winchester, R. J.},
  title = {The 1982 revised criteria for the classification of systemic lupus erythematosus},
  journal = {Arthritis \& Rheumatism},
  year = {1982},
  volume = {25},
  pages = {1271--1277},
  number = {11},
  abstract = {The 1971 preliminary criteria for the classification of systemic lupus erythematosus (SLE) were revised and updated to incorporate new immunologic knowledge and improve disease classification. The 1982 revised criteria include fluorescence antinuclear antibody and antibody to native DNA and Sm antigen. Some criteria involving the same organ systems were aggregated into single criteria. Raynaud's phenomenon and alopecia were not included in the 1982 revised criteria because of low sensitivity and specificity. The new criteria were 96\% sensitive and 96\% specific when tested with SLE and control patient data gathered from 18 participating clinics.
When compared with the 1971 criteria, the 1982 revised criteria showed gains in sensitivity and specificity.},
  keywords = {Arthritis/etiology Diagnosis, Differential False Positive Reactions Hematologic Diseases/etiology Humans Kidney Diseases/etiology Lupus Erythematosus, Systemic/*classification/diagnosis Mouth Diseases/etiology Nervous System Diseases/etiology Serologic Tests Serositis/etiology Skin Diseases/etiology Ulcer/etiology}
}

% Book entries below: library-catalogue text is kept under annote (ignored
% by the standard styles) so it is not printed; edition is given as an
% ordinal word, the form classic BibTeX styles expect.
% Page ranges in this group use "--".

@BOOK{Fletcher1988,
  author = {Fletcher, Robert H. and Fletcher, Suzanne W. and Wagner, Edward H.},
  title = {Clinical epidemiology: the essentials},
  publisher = {Williams \& Wilkins},
  year = {1988},
  address = {Baltimore},
  edition = {Second},
  annote = {87010445 Robert H. Fletcher, Suzanne W. Fletcher, Edward H. Wagner. ill. ; 23 cm. Includes bibliographies and index.},
  keywords = {Clinical epidemiology. Epidemiologic Methods.},
  owner = {rebeccaspeckman},
  timestamp = {2009.05.25}
}

@BOOK{Weiss2002,
  author = {Weiss, N. A.},
  title = {Introductory statistics},
  publisher = {Addison-Wesley},
  year = {2002},
  address = {Boston},
  edition = {Sixth},
  annote = {2001022689 Neil A. Weiss ; biographies by Carol A. Weiss. ill. (some col.) ; 26 cm. + 1 computer optical disc (4 3/4 in.) Includes indexes.},
  keywords = {Statistics.},
  owner = {rebeccaspeckman},
  timestamp = {2009.05.25}
}

% Speckman2009a: a manuscript in preparation is an UNPUBLISHED entry with
% the status in note, not an ARTICLE whose journal is "in preparation".
% The math in the title is brace-protected and the trailing period removed
% (styles add their own terminal punctuation).
@UNPUBLISHED{Speckman2009a,
  author = {Speckman, R. A.},
  title = {A comparison of {$K$}-centroids variations for the identification of disease subtypes with presence/absence attributes},
  note = {In preparation},
  year = {2009},
  owner = {rebeccaspeckman},
  timestamp = {2009.05.28}
}

% Akaike1974: journal acronym restored to IEEE (export had "Ieee").
@ARTICLE{Akaike1974,
  author = {Akaike, H.},
  title = {NEW LOOK AT STATISTICAL-MODEL IDENTIFICATION},
  journal = {IEEE Transactions on Automatic Control},
  year = {1974},
  volume = {AC19},
  pages = {716--723},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Schwarz1978,
  author = {Schwarz, G.},
  title = {ESTIMATING DIMENSION OF A MODEL},
  journal = {Annals of Statistics},
  year = {1978},
  volume = {6},
  pages = {461--464},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Kass1995,
  author = {Kass, R. E. and Raftery, A. E.},
  title = {BAYES FACTORS},
  journal = {Journal of the American Statistical Association},
  year = {1995},
  volume = {90},
  pages = {773--795},
  number = {430},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

% Tibshirani1997: the surname Cox is capitalised and brace-protected.
@ARTICLE{Tibshirani1997,
  author = {Tibshirani, R.},
  title = {The lasso method for variable selection in the {Cox} model},
  journal = {Statistics in Medicine},
  year = {1997},
  volume = {16},
  pages = {385--395},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Volinsky1997,
  author = {Volinsky, C. T. and Madigan, D. and Raftery, A. E. and Kronmal, R. A.},
  title = {{Bayesian} model averaging in proportional hazard models. Assessing the risk of a stroke},
  journal = {Applied Statistics-Journal of the Royal Statistical Society Series C},
  year = {1997},
  volume = {46},
  pages = {433--448},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Volinsky2000,
  author = {Volinsky, C. T. and Raftery, A.
E.},
  title = {{Bayesian} information criterion for censored survival models},
  journal = {Biometrics},
  year = {2000},
  volume = {56},
  pages = {256--262},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

% Titles in this group: proper nouns and acronyms are brace-protected so
% that sentence-casing styles do not lowercase them. Page ranges use "--".

@ARTICLE{Ibrahim1999,
  author = {Ibrahim, J. G. and Chen, M. H. and MacEachern, S. N.},
  title = {{Bayesian} variable selection for proportional hazards models},
  journal = {Canadian Journal of Statistics-Revue Canadienne De Statistique},
  year = {1999},
  volume = {27},
  pages = {701--717},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Fan2002,
  author = {Fan, J. Q. and Li, R. Z.},
  title = {Variable selection for {Cox's} proportional hazards model and frailty model},
  journal = {Annals of Statistics},
  year = {2002},
  volume = {30},
  pages = {74--99},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Box-Steffensmeier2006,
  author = {Box-Steffensmeier, J. M. and De Boef, S.},
  title = {Repeated events survival models: The conditional frailty model},
  journal = {Statistics in Medicine},
  year = {2006},
  volume = {25},
  pages = {3518--3533},
  number = {20},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Liang2008,
  author = {Liang, H. and Zou, G.},
  title = {Improved {AIC} selection strategy for survival analysis},
  journal = {Computational Statistics \& Data Analysis},
  year = {2008},
  volume = {52},
  pages = {2538--2548},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.08}
}

@ARTICLE{Lin1994,
  author = {Lin, D. Y.},
  title = {COX REGRESSION-ANALYSIS OF MULTIVARIATE FAILURE TIME DATA - THE MARGINAL APPROACH},
  journal = {Statistics in Medicine},
  year = {1994},
  volume = {13},
  pages = {2233--2247},
  number = {21},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.10}
}

@ARTICLE{Pinner1996,
  author = {Pinner, R. W. and Teutsch, S. M. and Simonsen, L. and Klug, L. A. and Graber, J. M. and Clarke, M. J. and Berkelman, R.
L.},
  title = {Trends in infectious diseases mortality in the {United States}},
  journal = {JAMA: The Journal of the American Medical Association},
  year = {1996},
  volume = {275},
  pages = {189--193},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.10}
}

% In the entry above the export form "Jama-Journal of the ..." was replaced
% by the journal's full name. Titles in this group: proper nouns and
% acronyms are brace-protected so that sentence-casing styles do not
% lowercase them. Page ranges use "--".

@ARTICLE{Wilson1998,
  author = {Wilson, D. and Bhopal, R.},
  title = {Impact of infection on mortality and hospitalization in the {North East} of {England}},
  journal = {Journal of Public Health Medicine},
  year = {1998},
  volume = {20},
  pages = {386--395},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.10}
}

@ARTICLE{Lin2000,
  author = {Lin, D. Y. and Wei, L. J. and Yang, I. and Ying, Z.},
  title = {Semiparametric regression for the mean and rate functions of recurrent events},
  journal = {Journal of the Royal Statistical Society Series B-Statistical Methodology},
  year = {2000},
  volume = {62},
  pages = {711--730},
  note = {Part 4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.10}
}

@ARTICLE{Lu2005,
  author = {Lu, T. H.},
  title = {Don't overlook infectious diseases in {ICD-9} chapters other than {Chapter I}},
  journal = {International Journal of Infectious Diseases},
  year = {2005},
  volume = {9},
  pages = {180--181},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.10}
}

@ARTICLE{Turner1929,
  author = {Turner, G. G.},
  title = {Cancer of the colon - Being the {Annual} oration of the {Medical Society of London}, delivered on {May} 13th, 1929},
  journal = {Lancet},
  year = {1929},
  volume = {1},
  pages = {1017--1023},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Bleicher1949,
  author = {Bleicher, J. E.},
  title = {CANCER OF THE COLON AND RECTUM - A 12-YEAR SURVEY OF 142 CASES IN A GENERAL HOSPITAL},
  journal = {Cancer},
  year = {1949},
  volume = {2},
  pages = {25--27},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Bader1952,
  author = {Bader, G. M. and Papanicolaou, G.
N.},
  title = {THE APPLICATION OF CYTOLOGY IN THE DIAGNOSIS OF CANCER OF THE RECTUM, SIGMOID, AND DESCENDING COLON},
  journal = {Cancer},
  year = {1952},
  volume = {5},
  pages = {307--314},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

% Several entries below carried the export placeholder "-\&" where the end
% page was unknown; it would print as a literal ampersand. The ranges are
% replaced by the published page numbers. NOTE(review): end pages were
% filled in from the journal records -- confirm against the source if exact
% pagination matters. Page ranges use "--".

@ARTICLE{Spratt1958,
  author = {Spratt, J. S. and Ackerman, L. V. and Moyer, C. A.},
  title = {RELATIONSHIP OF POLYPS OF THE COLON TO COLONIC CANCER},
  journal = {Annals of Surgery},
  year = {1958},
  volume = {148},
  pages = {682--698},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Floyd1966,
  author = {Floyd, C. E. and Stirling, C. T. and Cohn, I.},
  title = {CANCER OF COLON RECTUM AND ANUS - REVIEW OF 1,687 CASES},
  journal = {Annals of Surgery},
  year = {1966},
  volume = {163},
  pages = {829--837},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Cole1966,
  author = {Cole, W. H. and Roberts, S. S. and Strehl, F. W.},
  title = {MODERN CONCEPTS IN CANCER OF COLON AND RECTUM},
  journal = {Cancer},
  year = {1966},
  volume = {19},
  pages = {1347--1358},
  number = {10},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

% Wynder1967: the second author was the truncated export form "Shigemat.T",
% which BibTeX cannot split into surname and initial; restored to
% "Shigematsu, T.".
@ARTICLE{Wynder1967,
  author = {Wynder, E. L. and Shigematsu, T.},
  title = {ENVIRONMENTAL FACTORS OF CANCER OF COLON AND RECTUM},
  journal = {Cancer},
  year = {1967},
  volume = {20},
  pages = {1520--1561},
  number = {9},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Wynder1969,
  author = {Wynder, E. L. and Kajitani, T. and Ishikawa, S. and Dodo, H. and Takano, A.},
  title = {ENVIRONMENTAL FACTORS OF CANCER OF COLON AND RECTUM .2. JAPANESE EPIDEMIOLOGICAL DATA},
  journal = {Cancer},
  year = {1969},
  volume = {23},
  pages = {1210--1220},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Lynch1973,
  author = {Lynch, H. T. and Guirgis, H. and Swartz, M. and Lynch, J. and Krush, A. J. and Kaplan, A.
R.},
  title = {GENETICS AND COLON CANCER},
  journal = {Archives of Surgery},
  year = {1973},
  volume = {106},
  pages = {669--675},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

% Page ranges in this group use "--" (en dash).

@ARTICLE{Wolff1974,
  author = {Wolff, W. I. and Shinya, H.},
  title = {EARLIER DIAGNOSIS OF CANCER OF COLON THROUGH COLONIC ENDOSCOPY (COLONOSCOPY)},
  journal = {Cancer},
  year = {1974},
  volume = {34},
  pages = {912--931},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

% Burkitt1971: the export placeholder "3-\&" (unknown end page) would print
% a literal ampersand; replaced by the published range.
% NOTE(review): end page taken from the journal record -- confirm.
@ARTICLE{Burkitt1971,
  author = {Burkitt, D. P.},
  title = {EPIDEMIOLOGY OF CANCER OF COLON AND RECTUM},
  journal = {Cancer},
  year = {1971},
  volume = {28},
  pages = {3--13},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Morson1974,
  author = {Morson, B. C.},
  title = {EVOLUTION OF CANCER OF COLON AND RECTUM},
  journal = {Cancer},
  year = {1974},
  volume = {34},
  pages = {845--849},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Brenner1990,
  author = {Brenner, H. and Savitz, D. A.},
  title = {THE EFFECTS OF SENSITIVITY AND SPECIFICITY OF CASE SELECTION ON VALIDITY, SAMPLE-SIZE, PRECISION, AND POWER IN HOSPITAL-BASED CASE-CONTROL STUDIES},
  journal = {American Journal of Epidemiology},
  year = {1990},
  volume = {132},
  pages = {181--192},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Lasky1994,
  author = {Lasky, T. and Stolley, P. D.},
  title = {SELECTION OF CASES AND CONTROLS},
  journal = {Epidemiologic Reviews},
  year = {1994},
  volume = {16},
  pages = {6--17},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.13}
}

@ARTICLE{Everitt1995,
  author = {Everitt, B. S.},
  title = {CLASSIFICATION AND CLUSTER-ANALYSIS - COMMENTARY},
  journal = {British Medical Journal},
  year = {1995},
  volume = {311},
  pages = {535--536},
  number = {7004},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Dollfus1996: the export dropped the hyphen from the double-barrelled
% surname ("AssoulyBesse"); restored to "Assouly-Besse".
@ARTICLE{Dollfus1996,
  author = {Dollfus, S. and Everitt, B. and Ribeyre, J. M. and Assouly-Besse, F. and Sharp, C.
and Petit, M.},
  title = {Identifying subtypes of schizophrenia by cluster analyses},
  journal = {Schizophrenia Bulletin},
  year = {1996},
  volume = {22},
  pages = {545--555},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Titles in this group: proper nouns are brace-protected so that
% sentence-casing styles do not lowercase them. Page ranges use "--".

@ARTICLE{Jones2002,
  author = {Jones, E. and Hodgins-Vermaas, R. and McCartney, H. and Everitt, B. and Beech, C. and Poynter, D. and Palmer, I. and Hyams, K. and Wessely, S.},
  title = {Post-combat syndromes from the {Boer} war to the {Gulf} war: A cluster analysis of their nature and attribution},
  journal = {British Medical Journal},
  year = {2002},
  volume = {324},
  pages = {321--324},
  number = {7333},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Everitt2002,
  author = {Everitt, B. and Ismail, K. and David, A. S. and Wessely, S.},
  title = {Searching for a {Gulf War} syndrome using cluster analysis},
  journal = {Psychological Medicine},
  year = {2002},
  volume = {32},
  pages = {1371--1378},
  number = {8},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Everitt1971: the export placeholder "399-\&" (unknown end page) would
% print a literal ampersand; replaced by the published range.
% NOTE(review): end page taken from the journal record -- confirm.
@ARTICLE{Everitt1971,
  author = {Everitt, B. S. and Gourlay, A. J. and Kendell, R. E.},
  title = {ATTEMPT AT VALIDATION OF TRADITIONAL PSYCHIATRIC SYNDROMES BY CLUSTER ANALYSIS},
  journal = {British Journal of Psychiatry},
  year = {1971},
  volume = {119},
  pages = {399--412},
  number = {551},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Arnaut-Rollier1999,
  author = {Arnaut-Rollier, I. and Vauterin, L. and De Vos, P. and Massart, D. L. and Devriese, L. A. and De Zutter, L. and Van Hoof, J.},
  title = {A numerical taxonomic study of the {Pseudomonas} flora isolated from poultry meat},
  journal = {Journal of Applied Microbiology},
  year = {1999},
  volume = {87},
  pages = {15--28},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Iruela2002,
  author = {Iruela, M. and Rubio, J. and Cubero, J. I. and Gil, J.
and Millan, T.},
  title = {Phylogenetic analysis in the genus {Cicer} and cultivated chickpea using {RAPD} and {ISSR} markers},
  journal = {Theoretical and Applied Genetics},
  year = {2002},
  volume = {104},
  pages = {643--651},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% In the entry above the genus name is braced as a whole word rather than by
% its first letter only. Titles in this group: proper nouns and acronyms are
% brace-protected against down-casing. Page ranges use "--".

% Bayati2008: the hyphen-joined export form of the society name was written
% out, and the all-caps country name normalised.
@INPROCEEDINGS{Bayati2008,
  author = {Bayati, H. and Davoudi, H. and Fatemizadeh, E.},
  title = {A Heuristic Method for Finding the Optimal Number of Clusters with Application in Medical Data},
  booktitle = {30th Annual International Conference of the IEEE Engineering in Medicine and Biology Society},
  year = {2008},
  pages = {4684--4687},
  address = {Vancouver, Canada},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Souto2008: journal acronym restored to BMC (export had "Bmc").
% NOTE(review): no pages/article number recorded for this entry -- TODO add.
@ARTICLE{Souto2008,
  author = {de Souto, M. C. P. and Costa, I. G. and de Araujo, D. S. A. and Ludermir, T. B. and Schliep, A.},
  title = {Clustering cancer gene expression data: a comparative study},
  journal = {BMC Bioinformatics},
  year = {2008},
  volume = {9},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Ibanez2009,
  author = {Ibanez, M. T. and Caru, M. and Herrera, M. and Gonzalez, L. and Martin, L. and Miranda, J. and Navarro-Cerrillo, R.},
  title = {Clones identification of {Sequoia} sempervirens ({D. Don}) {Endl.} in {Chile} by using {PCR-RAPDs} technique},
  journal = {Journal of Zhejiang University-Science B},
  year = {2009},
  volume = {10},
  pages = {112--119},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Kurihara2009: straight double quotes replaced by TeX opening/closing
% quotes so the title typesets with proper quotation marks.
@ARTICLE{Kurihara2009,
  author = {Kurihara, K. and Welling, M.},
  title = {{Bayesian} k-Means as a ``Maximization-Expectation'' Algorithm},
  journal = {Neural Computation},
  year = {2009},
  volume = {21},
  pages = {1145--1172},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Piot1980: the export ran the second author's two-part surname together
% ("Vandyck"); restored to "Van Dyck".
@ARTICLE{Piot1980,
  author = {Piot, P. and Van Dyck, E. and Goodfellow, M.
and Falkow, S.},
  title = {A TAXONOMIC STUDY OF GARDNERELLA-VAGINALIS (HEMOPHILUS, VAGINALIS) GARDNER AND DUKES 1955},
  journal = {Journal of General Microbiology},
  year = {1980},
  volume = {119},
  pages = {373--396},
  month = aug,
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% In the entry above the export placeholder "373-\&" (unknown end page) was
% replaced by the published range (NOTE(review): confirm the end page).
% Where the export stored a month abbreviation (AUG, MAR) in the number
% field, it is now given as a month macro, so it no longer prints as an
% issue number. Page ranges use "--".

@ARTICLE{Martin1981,
  author = {Martin, R. and Riley, P. S. and Hollis, D. G. and Weaver, R. E. and Krichevsky, M. I.},
  title = {CHARACTERIZATION OF SOME GROUPS OF GRAM-NEGATIVE NONFERMENTATIVE BACTERIA BY THE CARBON SOURCE ALKALINIZATION TECHNIQUE},
  journal = {Journal of Clinical Microbiology},
  year = {1981},
  volume = {14},
  pages = {39--47},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Goodfellow1982,
  author = {Goodfellow, M. and Pirouz, T.},
  title = {NUMERICAL CLASSIFICATION OF SPOROACTINOMYCETES CONTAINING MESO-DIAMINOPIMELIC ACID IN THE CELL-WALL},
  journal = {Journal of General Microbiology},
  year = {1982},
  volume = {128},
  pages = {503--527},
  month = mar,
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Ridell1983,
  author = {Ridell, M. and Goodfellow, M.},
  title = {NUMERICAL CLASSIFICATION OF MYCOBACTERIUM-FARCINOGENES, MYCOBACTERIUM-SENEGALENSE AND RELATED TAXA},
  journal = {Journal of General Microbiology},
  year = {1983},
  volume = {129},
  pages = {599--611},
  month = mar,
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Goodfellow1987,
  author = {Goodfellow, M. and Harwood, C. R. and Nahaie, M. R.},
  title = {IMPACT OF PLASMIDS AND GENETIC CHANGE ON THE NUMERICAL CLASSIFICATION OF STAPHYLOCOCCI},
  journal = {Zentralblatt Fur Bakteriologie Mikrobiologie Und Hygiene Series a-Medical Microbiology Infectious Diseases Virology Parasitology},
  year = {1987},
  volume = {266},
  pages = {60--85},
  number = {1-2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Elhag1988,
  author = {Elhag, K. M.
and Senthilselvan, A.},
  title = {A SEROGROUPING SCHEME FOR THE STUDY OF THE EPIDEMIOLOGY OF BACTEROIDES-FRAGILIS},
  journal = {Journal of Medical Microbiology},
  year = {1988},
  volume = {27},
  pages = {199--205},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Titles in this group: acronyms, genus names and other proper nouns are
% brace-protected so that sentence-casing styles do not lowercase them.
% Page ranges use "--".

@ARTICLE{Mattioni2002,
  author = {Mattioni, C. and Casasoli, M. and Gonzalez, M. and Ipinza, R. and Villani, F.},
  title = {Comparison of {ISSR} and {RAPD} markers to characterize three {Chilean} {Nothofagus} species},
  journal = {Theoretical and Applied Genetics},
  year = {2002},
  volume = {104},
  pages = {1064--1070},
  number = {6-7},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Castro2002,
  author = {Castro, D. and Pujalte, M. J. and Lopez-Cortes, L. and Garay, E. and Borrego, J. J.},
  title = {Vibrios isolated from the cultured manila clam ({Ruditapes} philippinarum): numerical taxonomy and antibacterial activities},
  journal = {Journal of Applied Microbiology},
  year = {2002},
  volume = {93},
  pages = {438--447},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Hahm2003,
  author = {Hahm, B. K. and Maldonado, Y. and Schreiber, E. and Bhunia, A. K. and Nakatsu, C. H.},
  title = {Subtyping of foodborne and environmental isolates of {Escherichia} coli by multiplex-{PCR}, rep-{PCR}, {PFGE}, ribotyping and {AFLP}},
  journal = {Journal of Microbiological Methods},
  year = {2003},
  volume = {53},
  pages = {387--399},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Drossou2004,
  author = {Drossou, A. and Katsiotis, A. and Leggett, J. M. and Loukas, M. and Tsakas, S.},
  title = {Genome and species relationships in genus {Avena} based on {RAPD} and {AFLP} molecular markers},
  journal = {Theoretical and Applied Genetics},
  year = {2004},
  volume = {109},
  pages = {48--54},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Dimitriadou2004,
  author = {Dimitriadou, E. and Barth, M. and Windischberger, C. and Hornik, K.
and Moser, E.},
  title = {A quantitative comparison of functional {MRI} cluster analysis},
  journal = {Artificial Intelligence in Medicine},
  year = {2004},
  volume = {31},
  pages = {57--71},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Titles in this group: acronyms, taxon names and place names are
% brace-protected so that sentence-casing styles do not lowercase them.
% Page ranges use "--".

@ARTICLE{Hartwell2005,
  author = {Hartwell, S. I. and Claflin, L. W.},
  title = {Cluster analysis of contaminated sediment data: Nodal analysis},
  journal = {Environmental Toxicology and Chemistry},
  year = {2005},
  volume = {24},
  pages = {1816--1834},
  number = {7},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Deutsch2006,
  author = {Deutsch, R. and Cherner, M. and Grant, I.},
  title = {Significance testing of a cluster of multivariate binary variables: comparison of the tripartite {T} index to three common similarity measures},
  journal = {Statistical Methods in Medical Research},
  year = {2006},
  volume = {15},
  pages = {285--299},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

% Brickle2006: all-caps country name from the export normalised.
@INPROCEEDINGS{Brickle2006,
  author = {Brickle, P. and MacKenzie, K.},
  title = {Parasites as biological tags for {Eleginops} maclovinus ({Teleostei} : {Eleginopidae}) around the {Falkland Islands}},
  booktitle = {11th International Congress of Parasitology (ICOPA XI)},
  year = {2006},
  pages = {147--153},
  address = {Glasgow, Scotland},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Ercisli2008,
  author = {Ercisli, S. and Orhan, E. and Hizarci, Y. and Yildirim, N. and Agar, G.},
  title = {Genetic diversity in grapevine germplasm resources in the {Coruh} valley revealed by {RAPD} markers},
  journal = {Biochemical Genetics},
  year = {2008},
  volume = {46},
  pages = {590--597},
  number = {9-10},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Stenstrom1990,
  author = {Stenstrom, I. M.
and Molin, G.},
  title = {CLASSIFICATION OF THE SPOILAGE FLORA OF FISH, WITH SPECIAL REFERENCE TO SHEWANELLA-PUTREFACIENS},
  journal = {Journal of Applied Bacteriology},
  year = {1990},
  volume = {68},
  pages = {601--618},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Molin1992,
  author = {Molin, G. and Johansson, M. L. and Stahl, M. and Ahrne, S. and Andersson, R. and Jeppsson, B. and Bengmark, S.},
  title = {SYSTEMATICS OF THE LACTOBACILLUS POPULATION ON RAT INTESTINAL-MUCOSA WITH SPECIAL REFERENCE TO LACTOBACILLUS-REUTERI},
  journal = {Antonie Van Leeuwenhoek International Journal of General and Molecular Microbiology},
  year = {1992},
  volume = {61},
  pages = {175--183},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Molin1993,
  author = {Molin, G. and Jeppsson, B. and Johansson, M. L. and Ahrne, S. and Nobaek, S. and Stahl, M. and Bengmark, S.},
  title = {Numerical taxonomy of lactobacillus spp associated with healthy and diseased mucosa of the human intestines},
  journal = {Journal of Applied Bacteriology},
  year = {1993},
  volume = {74},
  pages = {314--323},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Ternstrom1993,
  author = {Ternstrom, A. and Lindberg, A. M. and Molin, G.},
  title = {CLASSIFICATION OF THE SPOILAGE FLORA OF RAW AND PASTEURIZED BOVINE-MILK, WITH SPECIAL REFERENCE TO PSEUDOMONAS AND BACILLUS},
  journal = {Journal of Applied Bacteriology},
  year = {1993},
  volume = {75},
  pages = {25--34},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Odonnell1993,
  author = {Odonnell, A. G. and Falconer, C. and Goodfellow, M. and Ward, A. C.
and Williams, E.},
  title = {BIOSYSTEMATICS AND DIVERSITY AMONGST NOVEL CARBOXYDOTROPHIC ACTINOMYCETES},
  journal = {Antonie Van Leeuwenhoek International Journal of General and Molecular Microbiology},
  year = {1993},
  volume = {64},
  pages = {325--340},
  number = {3-4},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Yohalem1994,
  author = {Yohalem, D. S. and Lorbeer, J. W.},
  title = {INTRASPECIFIC METABOLIC DIVERSITY AMONG STRAINS OF BURKHOLDERIA-CEPACIA ISOLATED FROM DECAYED ONIONS, SOILS, AND THE CLINICAL ENVIRONMENT},
  journal = {Antonie Van Leeuwenhoek International Journal of General and Molecular Microbiology},
  year = {1994},
  volume = {65},
  pages = {111--131},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Johansson1995,
  author = {Johansson, M. L. and Sanni, A. and Lonner, C. and Molin, G.},
  title = {PHENOTYPICALLY BASED TAXONOMY USING API 50CH OF LACTOBACILLI FROM NIGERIAN OGI, AND THE OCCURRENCE OF STARCH FERMENTING STRAINS},
  journal = {International Journal of Food Microbiology},
  year = {1995},
  volume = {25},
  pages = {159--168},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Ramser1996,
  author = {Ramser, J. and LopezPeralta, C. and Wetzel, R. and Weising, K. and Kahl, G.},
  title = {Genomic variation and relationships in aerial yam ({Dioscorea} bulbifera {L}) detected by random amplified polymorphic {DNA}},
  journal = {Genome},
  year = {1996},
  volume = {39},
  pages = {17--25},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.19}
}

@ARTICLE{Kim2006,
  author = {Kim, J. and Jacobs, D. R. and Luepker, R. V. and Shahar, E. and Margolis, K. L. and Becker, M. P.},
  title = {Prognostic value of a novel classification scheme for heart failure: The {Minnesota Heart Failure Criteria}},
  journal = {American Journal of Epidemiology},
  year = {2006},
  volume = {164},
  pages = {184--193},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.29}
}

@ARTICLE{King2007,
  author = {King, W. M. and Giess, S. A.
and Lombardino, L. J.},
  title = {Subtyping of children with developmental dyslexia via bootstrap aggregated clustering and the gap statistic: comparison with the double-deficit hypothesis},
  journal = {International Journal of Language \& Communication Disorders},
  year = {2007},
  volume = {42},
  pages = {77--95},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.29}
}

@INPROCEEDINGS{Skeem2004,
  author = {Skeem, J. and Johansson, P. and Andershed, H. and Kerr, M. and Louden, J. E.},
  title = {Two subtypes of psychopathic violent offenders that parallel primary and secondary variants},
  booktitle = {4th Conference of the International-Association-of-Forensic-Mental-Health-Services},
  year = {2004},
  pages = {395--409},
  address = {Stockholm, SWEDEN},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.29}
}

@ARTICLE{Mun2008,
  author = {Mun, E. Y. and von Eye, A. and Bates, M. E. and Vaschillo, E. G.},
  title = {Finding groups using model-based cluster analysis: Heterogeneous emotional self-regulatory processes and heavy alcohol use risk},
  journal = {Developmental Psychology},
  year = {2008},
  volume = {44},
  pages = {481--495},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.29}
}

@ARTICLE{Munson2008,
  author = {Munson, J. and Dawson, G. and Sterling, L. and Beauchaine, T. and Zhou, A. and Koehler, E. and Lord, C. and Rogers, S. and Sigman, M. and Estes, A. and Abbott, R.},
  title = {Evidence for Latent Classes of {IQ} in Young Children With Autism Spectrum Disorder},
  journal = {American Journal on Mental Retardation},
  year = {2008},
  volume = {113},
  pages = {439--452},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.06.29}
}

@MISC{SASgeneral,
  author = {{SAS}},
  title = {Copyright, {SAS} {I}nstitute {I}nc. {SAS} and all other {SAS} {I}nstitute {I}nc.
product or service names are registered trademarks or trademarks of {SAS} {I}nstitute {I}nc., {C}ary, {NC}, {USA}.},
  year = {2009},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.04}
}

@BOOK{Maxcy1998,
  title = {Maxcy-Rosenau-Last public health \& preventive medicine},
  publisher = {Appleton \& Lange},
  year = {1998},
  author = {Maxcy, Kenneth Fuller and Rosenau, M. J. and Last, John M. and Wallace, Robert B.},
  address = {Stamford, Conn.},
  edition = {14th},
  note = {97053116 editor, Robert B. Wallace. Public health \& preventive medicine Public health and preventive medicine ill., maps ; 29 cm. Includes bibliographical references and index.},
  keywords = {Public health. Medicine, Preventive. Public Health Preventive Medicine},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.04}
}

@INCOLLECTION{Tyler1998,
  author = {Tyler, Carl W. and Last, John M.},
  title = {Epidemiology},
  booktitle = {Maxcy-Rosenau-Last public health \& preventive medicine},
  publisher = {Appleton \& Lange},
  year = {1998},
  editor = {Maxcy, Kenneth Fuller and Rosenau, M. J. and Last, John M. and Wallace, Robert B.},
  address = {Stamford, Conn.},
  edition = {14th},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.04}
}

@INCOLLECTION{Milligan1996,
  author = {Milligan, Glenn W.},
  title = {Clustering validation: {R}esults and implications for applied analyses},
  booktitle = {Clustering and classification},
  publisher = {World Scientific},
  year = {1996},
  editor = {Arabie, Phipps and Hubert, Lawrence J. and Soete, Geert de},
  pages = {341--375},
  address = {Singapore; River Edge, NJ},
  keywords = {Cluster analysis. Discriminant analysis.},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.04}
}

@ARTICLE{Nylund2007,
  author = {Nylund, K. L. and Asparouhov, T. and Muthen, B.
O.},
  title = {Deciding on the number of classes in latent class analysis and growth mixture modeling: A {Monte Carlo} simulation study},
  journal = {Structural Equation Modeling: A Multidisciplinary Journal},
  year = {2007},
  volume = {14},
  pages = {535--569},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.27}
}

@ARTICLE{Yang2007,
  author = {Yang, C. C. and Yang, C. C.},
  title = {Separating latent classes by information criteria},
  journal = {Journal of Classification},
  year = {2007},
  volume = {24},
  pages = {183--203},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.27}
}

@ARTICLE{Nylund2008,
  author = {Nylund, K. L. and Asparouhov, T. and Muthen, B. O.},
  title = {Deciding on the number of classes in latent class analysis and growth mixture modeling: A {Monte Carlo} simulation study (vol 14, pg 535, 2007)},
  journal = {Structural Equation Modeling: A Multidisciplinary Journal},
  year = {2008},
  volume = {15},
  pages = {182},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.07.27}
}

@ARTICLE{Somers2002,
  author = {Somers, E. and Magder, L. S. and Petri, M.},
  title = {Antiphospholipid antibodies and incidence of venous thrombosis in a cohort of patients with systemic lupus erythematosus},
  journal = {Journal of Rheumatology},
  year = {2002},
  volume = {29},
  pages = {2531--2536},
  number = {12},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Smolen1998,
  author = {Smolen, J. S. and Steiner, G.},
  title = {Mixed connective tissue disease - To be or not to be?},
  journal = {Arthritis and Rheumatism},
  year = {1998},
  volume = {41},
  pages = {768--777},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Singh-Manoux2002,
  author = {Singh-Manoux, A. and Clarke, P.
and Marmot, M.},
  title = {Multiple measures of socio-economic position and psychosocial health: proximal and distal measures},
  journal = {International Journal of Epidemiology},
  year = {2002},
  volume = {31},
  pages = {1192--1199},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Sharp1999,
  author = {Sharp, G. C. and Hoffman, R. W.},
  title = {Clinical, immunologic, and immunogenetic evidence that mixed connective tissue disease is a distinct entity: comment on the article by {Smolen} and {Steiner}},
  journal = {Arthritis and Rheumatism},
  year = {1999},
  volume = {42},
  pages = {190--191},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Schroeder2001,
  author = {Schroeder, J. C. and Weinberg, C. R.},
  title = {Use of missing-data methods to correct bias and improve precision in case-control studies in which cases are subtyped but subtype information is incomplete},
  journal = {American Journal of Epidemiology},
  year = {2001},
  volume = {154},
  pages = {954--962},
  number = {10},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Schoenfeld1983,
  author = {Schoenfeld, D. A.},
  title = {SAMPLE-SIZE FORMULA FOR THE PROPORTIONAL-HAZARDS REGRESSION-MODEL},
  journal = {Biometrics},
  year = {1983},
  volume = {39},
  pages = {499--503},
  number = {2},
  note = {ISI Document Delivery No.: RC899 Times Cited: 165 Cited Reference Count: 8},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Schafer2002,
  author = {Schafer, J. L. and Graham, J. W.},
  title = {Missing data: Our view of the state of the art},
  journal = {Psychological Methods},
  year = {2002},
  volume = {7},
  pages = {147--177},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Sarwal2003,
  author = {Sarwal, M. and Chua, M. S. and Kambham, N. and Hsieh, S. C. and Satterwhite, T. and Masek, M.
and Salvatierra, O.},
  title = {Molecular heterogeneity in acute renal allograft rejection identified by {DNA} microarray profiling},
  journal = {New England Journal of Medicine},
  year = {2003},
  volume = {349},
  pages = {125--138},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Rothschild1983,
  author = {Rothschild, B. M. and Jones, J. V. and Chesney, C. and Pifer, D. D. and Thompson, L. D. and James, K. K. and Badger, H.},
  title = {RELATIONSHIP OF CLINICAL FINDINGS IN SYSTEMIC LUPUS-ERYTHEMATOSUS TO SERO-REACTIVITY},
  journal = {Arthritis and Rheumatism},
  year = {1983},
  volume = {26},
  pages = {45--51},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Ramos-Casals2007,
  author = {Ramos-Casals, M. and Brito-Zeron, P. and Font, J.},
  title = {The overlap of {Sj{\"o}gren's} syndrome with other systemic autoimmune diseases},
  journal = {Seminars in Arthritis and Rheumatism},
  year = {2007},
  volume = {36},
  pages = {246--255},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Pulkstenis2004,
  author = {Pulkstenis, E. and Robinson, T. J.},
  title = {Goodness-of-fit tests for ordinal response regression models},
  journal = {Statistics in Medicine},
  year = {2004},
  volume = {23},
  pages = {999--1014},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Petri1991,
  author = {Petri, M. and Perezgutthann, S. and Longenecker, J. C. and Hochberg, M.},
  title = {MORBIDITY OF SYSTEMIC LUPUS-ERYTHEMATOSUS - ROLE OF RACE AND SOCIOECONOMIC-STATUS},
  journal = {American Journal of Medicine},
  year = {1991},
  volume = {91},
  pages = {345--353},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Petri1993,
  author = {Petri, M. and Caffentzis, E. and Conroy, M. and Goldman, D.},
  title = {Clinical presentation of systemic lupus erythematosus ({SLE}), 1960-92},
  journal = {Arthritis and Rheumatism},
  year = {1993},
  volume = {36},
  pages = {R22},
  number = {5},
  note = {Suppl.
S},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Petri2000,
  author = {Petri, M.},
  title = {Detection of coronary artery disease and the role of traditional risk factors in the {Hopkins Lupus Cohort}},
  journal = {Lupus},
  year = {2000},
  volume = {9},
  pages = {170--175},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Petri2000a,
  author = {Petri, M.},
  title = {{Hopkins Lupus Cohort} - 1999 update},
  journal = {Rheumatic Disease Clinics of North America},
  year = {2000},
  volume = {26},
  pages = {199+},
  number = {2},
  abstract = {The Hopkins Lupus Cohort is a decade-long prospective study, now numbering 800 patients with systemic lupus erythematosus. In this article, predictors of disease activity, disease damage (including accelerated atherosclerosis and antiphospholipid antibody syndrome) and health status are reviewed.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Petri1997,
  author = {Petri, M.},
  title = {Smoking is a risk factor for musculoskeletal. Pulmonary and cardiac disease in systemic lupus erythematosus},
  journal = {Arthritis and Rheumatism},
  year = {1997},
  volume = {40},
  pages = {527},
  number = {9},
  note = {Suppl. S},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Pepe2008,
  author = {Pepe, M. S. and Feng, Z. and Huang, Y. and Longton, G. and Prentice, R. and Thompson, I. M. and Zheng, Y.},
  title = {Integrating the predictiveness of a marker with its performance as a classifier},
  journal = {American Journal of Epidemiology},
  year = {2008},
  volume = {167},
  pages = {362--368},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Panush1993,
  author = {Panush, R. S. and Greer, J. M. and Morshedian, K. K.},
  title = {WHAT IS LUPUS - WHAT IS NOT LUPUS},
  journal = {Rheumatic Disease Clinics of North America},
  year = {1993},
  volume = {19},
  pages = {223--234},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Okamura1993,
  author = {Okamura, M.
and Kanayama, Y. and Amastu, K. and Negoro, N. and Kohda, S. and Takeda, T. and Inoue, T.},
  title = {SIGNIFICANCE OF ENZYME-LINKED-IMMUNOSORBENT-ASSAY (ELISA) FOR ANTIBODIES TO DOUBLE-STRANDED AND SINGLE-STRANDED-DNA IN PATIENTS WITH LUPUS NEPHRITIS - CORRELATION WITH SEVERITY OF RENAL HISTOLOGY},
  journal = {Annals of the Rheumatic Diseases},
  year = {1993},
  volume = {52},
  pages = {14--20},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Obuchowski2004,
  author = {Obuchowski, N. A. and Lieber, M. L. and Wians, F. H.},
  title = {{ROC} curves in Clinical chemistry: Uses, misuses, and possible solutions},
  journal = {Clinical Chemistry},
  year = {2004},
  volume = {50},
  pages = {1118--1125},
  number = {7},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Muller1992,
  author = {Muller, H. G.},
  title = {GOODNESS-OF-FIT DIAGNOSTICS FOR REGRESSION-MODELS},
  journal = {Scandinavian Journal of Statistics},
  year = {1992},
  volume = {19},
  pages = {157--172},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Mukherjee2009,
  author = {Mukherjee, B. and Liu, I.},
  title = {A note on bias due to fitting prospective multivariate generalized linear models to categorical outcomes ignoring retrospective sampling schemes},
  journal = {Journal of Multivariate Analysis},
  year = {2009},
  volume = {100},
  pages = {459--472},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Mosca2004,
  author = {Mosca, M. and Baldini, C. and Bombardieri, S.},
  title = {Undifferentiated connective tissue diseases in 2004},
  journal = {Clinical and Experimental Rheumatology},
  year = {2004},
  volume = {22},
  pages = {S14--S18},
  number = {3},
  note = {Suppl. 33},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Miller1983,
  author = {Miller, M. H. and Urowitz, M. B. and Gladman, D.
D.},
  title = {THE SIGNIFICANCE OF THROMBOCYTOPENIA IN SYSTEMIC LUPUS-ERYTHEMATOSUS},
  journal = {Arthritis and Rheumatism},
  year = {1983},
  volume = {26},
  pages = {1181--1186},
  number = {10},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Migliorini2005,
  author = {Migliorini, P. and Baldini, C. and Rocchi, V. and Bombardieri, S.},
  title = {Anti-{Sm} and anti-{RNP} antibodies},
  journal = {Autoimmunity},
  year = {2005},
  volume = {38},
  pages = {47--54},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{McCarthy1999,
  author = {McCarthy, N. and Giesecke, J.},
  title = {Case-case comparisons to study causation of common infectious diseases},
  journal = {International Journal of Epidemiology},
  year = {1999},
  volume = {28},
  pages = {764--768},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@MISC{McCarthy,
  author = {McCarthy},
  title = {The existence of {MLE} in binary logistic regression},
  note = {Source unclear},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@BOOK{Maxcy1998a,
  title = {Maxcy-Rosenau-Last public health \& preventive medicine},
  publisher = {Appleton \& Lange},
  year = {1998},
  author = {Maxcy, Kenneth Fuller and Rosenau, M. J. and Last, John M. and Wallace, Robert B.},
  address = {Stamford, Conn.},
  edition = {14th},
  note = {97053116 editor, Robert B. Wallace. Public health \& preventive medicine Public health and preventive medicine ill., maps ; 29 cm. Includes bibliographical references and index.},
  keywords = {Public health. Medicine, Preventive. Public Health Preventive Medicine},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Gladman1997,
  author = {Gladman, D. D. and Urowitz, M. B. and Goldsmith, C. H. and Fortin, P. and Ginzler, E. and Gordon, C. and Hanly, J. G. and Isenberg, D. A. and Kalunian, K. and Nived, O. and Petri, M. and SanchezGuerrero, J. and Snaith, M.
and Sturfelt, G.},
  title = {The reliability of the {Systemic Lupus International Collaborating Clinics} {American College of Rheumatology} Damage Index in patients with Systemic Lupus Erythematosus},
  journal = {Arthritis and Rheumatism},
  year = {1997},
  volume = {40},
  pages = {809--813},
  number = {5},
  abstract = {Objective. To test the reliability of the Systemic Lupus International Collaborating Clinics/American College of Rheumatology (SLICC/ACR) Damage Index and the Systemic Lupus Erythematosus Disease Activity Index (SLEDAI) in the assessment of patients with SLE. Methods. Ten patients with SLE, representing a spectrum of damage and activity, were included. Each patient was examined by 6 of 10 physicians from 5 countries, representing 10 lupus clinics. The SLICC/ACR Damage Index was used to assess accumulated damage, and the SLEDAI was used to assess disease activity. The order of the patients and physicians was randomized according to a Youden square design. Results. The SLICC/ACR Damage Index detected differences among patients (P <0.001). There was no detectable observer difference (P=0.933), and there was no order effect (P=0.261). Similar results were obtained with the SLEDAI. There was concordance in the SLICC/ACR Damage Index among observers, despite a wide spectrum of disease activity detected by the SLEDAI. Conclusion. Physicians from different centers are able to assess patients with SLE in a reproducible way, using the SLEDAI to assess disease activity and the SLICC/ACR Damage Index to assess accumulated damage.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Gladman2003,
  author = {Gladman, D. D. and Hirani, N. and Ibanez, D. and Urowitz, M. B.},
  title = {Clinically active serologically quiescent systemic lupus erythematosus},
  journal = {Journal of Rheumatology},
  year = {2003},
  volume = {30},
  pages = {1960--1962},
  number = {9},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Gladman1996,
  author = {Gladman, D. and Ginzler, E.
and Goldsmith, C. and Fortin, P. and Liang, M. and Urowitz, M. and Bacon, P. and Bombardieri, S. and Hanly, J. and Hay, E. and Isenberg, D. and Jones, J. and Kalunian, K. and Maddison, P. and Nived, O. and Petri, M. and Richter, M. and SanchezGuerrero, J. and Snaith, M. and Sturfelt, G. and Symmons, D. and Zoma, A.},
  title = {The development and initial validation of the {Systemic Lupus International Collaborating Clinics} {American College of Rheumatology} Damage Index for Systemic Lupus Erythematosus},
  journal = {Arthritis and Rheumatism},
  year = {1996},
  volume = {39},
  pages = {363--369},
  number = {3},
  abstract = {Objective. To develop and perform an initial validation of a damage index for systemic lupus erythematosus (SLE). Methods. A list of items considered to reflect damage in SLE was generated through a nominal group process. A consensus as to which items to be included in an index was reached, together with rules for ascertainment. Each center submitted 2 assessments, 5 years apart, on 2 patients with active and 2 with inactive disease, of whom 1 had increased damage and the other had stable disease. Analysis of variance was used to test the factors physician, time, amount of damage, and activity status. Results. Nineteen physicians completed the damage index on 42 case scenarios. The analysis revealed that the damage index could identify changes in damage seen in patients with both active and inactive disease. Patients who had active disease at both time points had a higher increase in damage. There was good agreement among the physicians on the assessment of damage in these patients. Conclusion. This damage index for SLE records damage occurring in patients with SLE regardless of its cause. The index was demonstrated to have content, face, criterion, and discriminant validity.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Garcia1996,
  author = {Garcia, C. O. and Molina, J. F. and GutierrezUrena, S. and Scopelitis, E. and Wilson, W. A.
and Gharavi, A. E. and Espinoza, L. R.},
  title = {Autoantibody profile in {African-American} patients with lupus nephritis},
  journal = {Lupus},
  year = {1996},
  volume = {5},
  pages = {602--605},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Furnival1974,
  author = {Furnival, G. M. and Wilson, R. W.},
  title = {Regressions by leaps and bounds},
  journal = {Technometrics},
  year = {1974},
  volume = {16},
  pages = {499--511},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Font2004,
  author = {Font, J. and Cervera, R. and Ramos-Casals, M. and Garcia-Carrasco, M. and Sentis, J. and Herrero, C. and del Olmo, J. A. and Darnell, A. and Ingelmo, M.},
  title = {Clusters of clinical and immunologic features in systemic lupus erythematosus: Analysis of 600 patients from a single center},
  journal = {Seminars in Arthritis and Rheumatism},
  year = {2004},
  volume = {33},
  pages = {217--230},
  number = {4},
  abstract = {Objective: To analyze the prevalence and characteristics of the main clinical, hematologic, and immunologic manifestations of systemic lupus erythematosus (SLE) in a cohort of 600 consecutive patients from a single center, and to determine the specific characteristics of organ involvement in a homogeneous SLE population. Methods: Patients were consecutively seen in our department either as inpatients or outpatients between 1980 and 2001. All had documented medical histories and underwent a medical interview as well as a routine general physical examination. Clinical and serologic characteristics of all patients were consecutively collected in a protocol form. Results: The final cohort (survival cohort) consisted of 533 (89\%) women and 67 (11\%) men (female to male ratio, 8:1), with an average of 29 new patients per year. Mean age at onset of symptoms attributable to the disease was 31 years (range, 5 to 84 years) and mean age at the time of diagnosis of SLE was 33 years (range, 6 to 85 years).
The most frequent SLE involvement was articular involvement, found in 498 patients (83\%), followed by hematologic involvement in 451 patients (75\%), specific SLE cutaneous involvement in 354 patients (59\%), constitutional features in 252 patients (42\%), and nephropathy in 203 patients (34\%). Patients enrolled in the protocol before 1991 had a higher frequency of central nervous system (CNS) involvement (27\% vs 10\%, P < .001), thrombotic events (17\% vs 9\%, P = .003), and abnormal hematologic parameters (85\% vs 66\%, P < .01), but a lower frequency of articular involvement (79\% vs 86\%, P = .038) than those enrolled after 1991. The following were observed associations: specific SLE cutaneous involvement was associated with anti-Sm antibodies; renal involvement with hemolytic anemia and anti-double-stranded DNA antibodies; CNS involvement with thrombocytopenia and immunoglobulin G-anticardiolipin; thrombotic events with low total hemolytic complement, immunoglobulin G-anticardiolipin, and lupus anticoagulant; and myositis with anemia and anti-ribonucleoprotein antibodies. Conclusion: This large study, performed in a single center, has shown cluster associations between certain clinical, hematologic, and immunologic features of SLE, reflecting specific patterns of disease expression. The accurate evaluation of clinical features and laboratory markers at disease diagnosis and during evolution may improve the clinical treatment of SLE patients. (C) 2004 Elsevier Inc. All rights reserved.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Firth1993,
  author = {Firth, D.},
  title = {Bias reduction of maximum-likelihood estimates},
  journal = {Biometrika},
  year = {1993},
  volume = {80},
  pages = {27--38},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Fagerland2008,
  author = {Fagerland, M. W. and Hosmer, D. W. and Bofin, A.
M.},
  title = {Multinomial goodness-of-fit tests for logistic regression models},
  journal = {Statistics in Medicine},
  year = {2008},
  volume = {27},
  pages = {4238--4253},
  number = {21},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Edworthy1988,
  author = {Edworthy, S. M. and Zatarain, E. and McShane, D. J. and Bloch, D. A.},
  title = {Analysis of the 1982 {ARA} lupus criteria data set by recursive partitioning methodology: new insights into the relative merit of individual criteria},
  journal = {Journal of Rheumatology},
  year = {1988},
  volume = {15},
  pages = {1493--1498},
  number = {10},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Doria2005,
  author = {Doria, A. and Mosca, M. and Gambari, P. F. and Bombardieri, S.},
  title = {Defining unclassifiable connective tissue diseases: Incomplete, undifferentiated, or both?},
  journal = {Journal of Rheumatology},
  year = {2005},
  volume = {32},
  pages = {213--215},
  number = {2},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@BOOK{Mausner1985,
  title = {Epidemiology : an introductory text},
  publisher = {Saunders},
  year = {1985},
  author = {Mausner, Judith S. and Kramer, Shira and Bahn, Anita K.},
  address = {Philadelphia},
  edition = {2nd},
  note = {83020292 Judith S. Mausner, Shira Kramer ; with contributions by Peter Gann, G. Stephen Bowen ; with the collaboration of Richard Morton. ill. ; 23 cm. At head of title: Mausner \& Bahn. Includes bibliographies and index.},
  keywords = {Epidemiology.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Matheny2005,
  author = {Matheny, M. E. and Ohno-Machado, L. and Resnic, F. S.},
  title = {Discrimination and calibration of mortality risk prediction models in interventional cardiology},
  journal = {Journal of Biomedical Informatics},
  year = {2005},
  volume = {38},
  pages = {367--375},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Maddison2000,
  author = {Maddison, P.
J.},
  title = {Mixed connective tissue disease: overlap syndromes},
  journal = {Best Practice \& Research in Clinical Rheumatology},
  year = {2000},
  volume = {14},
  pages = {111--124},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Lin2006,
  author = {Lin, H. Y. and Myers, L.},
  title = {Power and {Type I} error rates of goodness-of-fit statistics for binomial generalized estimating equations ({GEE}) models},
  journal = {Computational Statistics \& Data Analysis},
  year = {2006},
  volume = {50},
  pages = {3432--3448},
  number = {12},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Li2007,
  author = {Li, Q. Z. and Zhou, J. and Wandstrat, A. E. and Carr-Johnson, F. and Branch, V. and Karp, D. R. and Mohan, C. and Wakeland, E. K. and Olsen, N. J.},
  title = {Protein array autoantibody profiles for insights into systemic lupus erythematosus and incomplete lupus syndromes},
  journal = {Clinical and Experimental Immunology},
  year = {2007},
  volume = {147},
  pages = {60--70},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Lee2008,
  author = {Lee, S. S. and Singh, S. and Link, K. and Petri, M.},
  title = {High-sensitivity {C}-reactive protein as an associate of clinical subsets and organ damage in systemic lupus erythematosus},
  journal = {Seminars in Arthritis and Rheumatism},
  year = {2008},
  volume = {38},
  pages = {41--54},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Lasko2005,
  author = {Lasko, T. A. and Bhagwat, J. G. and Zou, K. H. and Ohno-Machado, L.},
  title = {The use of receiver operating characteristic curves in biomedical informatics},
  journal = {Journal of Biomedical Informatics},
  year = {2005},
  volume = {38},
  pages = {404--415},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Kokori2000,
  author = {Kokori, S. I. G. and Ioannidis, J. P. A. and Voulgarelis, M. and Tzioufas, A. G. and Moutsopoulos, H.
M.},
  title = {Autoimmune hemolytic anemia in patients with systemic lupus erythematosus},
  journal = {American Journal of Medicine},
  year = {2000},
  volume = {108},
  pages = {198--204},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@BOOK{Kleinbaum2002,
  title = {Logistic regression : a self-learning text},
  publisher = {Springer},
  year = {2002},
  author = {Kleinbaum, David G. and Klein, Mitchel and Pryor, Erica Rihl},
  series = {Statistics for biology and health},
  address = {New York},
  edition = {2nd},
  note = {2002019728 David G. Kleinbaum, Mitchel Klein ; with contributions by Erica Rihl Pryor. ill. ; 25 cm. Includes bibliographical references (p. 503-505) and index.},
  keywords = {Medicine Research Statistical methods. Regression analysis. Logistic distribution.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Demidenko2007,
  author = {Demidenko, E.},
  title = {Sample size determination for logistic regression revisited},
  journal = {Statistics in Medicine},
  year = {2007},
  volume = {26},
  pages = {3385--3397},
  number = {18},
  abstract = {There is no consensus on the approach to compute the power and sample size with logistic regression. Some authors use the likelihood ratio test; some use the test on proportions; some suggest various approximations to handle the multivariate case. We advocate the use of the Wald test since the Z-score is routinely used for statistical significance testing of regression coefficients. The null-variance formula became popular from early studies, which contradicts modern software, which utilizes the method of maximum likelihood estimation (MLE), when the variance of the MLE is estimated at the MLE, not at the null. We derive general Wald-based power and sample size formulas for logistic regression and then apply them to binary exposure and confounder to obtain a closed-form expression.
These formulas are applied to minimize the total sample size in a case-control study to achieve a given power by optimizing the ratio of controls to cases. Approximately, the optimal number of controls to cases is equal to the square root of the alternative odds ratio. Our sample size and power calculations can be carried out online at www.dartmouth.edu/\textasciitilde{}eugened. Copyright (c) 2006 John Wiley \& Sons, Ltd.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Costenbader2004,
  author    = {Costenbader, K. H. and Kim, D. J. and Peerzada, J. and Lockman, S. and Nobles-Knight, D. and Petri, M. and Karlson, E. W.},
  title     = {Cigarette smoking and the risk of systemic lupus erythematosus - A meta-analysis},
  journal   = {Arthritis and Rheumatism},
  year      = {2004},
  volume    = {50},
  pages     = {849-857},
  number    = {3},
  abstract  = {Objective. Existing studies present conflicting evidence for the role of cigarette smoking as a risk factor in the development of systemic lupus erythematosus (SLE). We performed an extensive search of the medical literature for all studies examining this relationship, and performed a meta-analysis to arrive at a more precise estimate of effect. Methods. We performed a computerized literature search for all studies (in all languages), using Medline and EMBASE (1966 to present) and the Cochrane Collaboration database, and completed hand searches of relevant bibliographies and abstracts of conference proceedings. Several investigators systematically extracted data from the relevant studies. Unpublished data were obtained from the author of one abstract. Studies were examined in aggregate for heterogeneity and publication bias. The relationships of current smoking and past smoking (prior to the onset of SLE) to development of SLE were analyzed separately. Results. Fifty-two studies were identified and chosen for detailed review. Of these, 9 (7 case-control and 2 cohort studies) were appropriate for inclusion in our meta-analyses.
For current smokers compared with nonsmokers, the odds ratio (OR) for development of SLE was significantly elevated (OR 1.50, 95\% confidence interval [95\% CI] 1.09-2.08). Former smokers, compared with nonsmokers, did not demonstrate an increased risk of SLE (OR 0.98, 95\% CI 0.75-1.27). Several subgroups were also analyzed. Conclusion. Our meta-analysis of the 7 existing case-control and 2 cohort studies revealed a small but statistically significant association between current smoking and development of SLE. However, no association between past smoking and development of SLE was observed.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Cook2008,
  author    = {Cook, N. R.},
  title     = {Statistical evaluation of prognostic versus diagnostic models: Beyond the ROC curve},
  journal   = {Clinical Chemistry},
  year      = {2008},
  volume    = {54},
  pages     = {17-23},
  number    = {1},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Cook2007,
  author    = {Cook, N. R.},
  title     = {Use and misuse of the receiver operating characteristic curve in risk prediction},
  journal   = {Circulation},
  year      = {2007},
  volume    = {115},
  pages     = {928-935},
  number    = {7},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Chatterjee2004,
  author    = {Chatterjee, N.},
  title     = {A two-stage regression model for epidemiological studies with multivariate disease classification data},
  journal   = {Journal of the American Statistical Association},
  year      = {2004},
  volume    = {99},
  pages     = {127-138},
  number    = {465},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Cervera1993,
  author    = {Cervera, R. and Khamashta, M. A. and Font, J. and Sebastiani, G. D. and Gil, A. and Lavilla, P. and Dom{\'e}nech, I. and Aydintug, A. O. and Jedryka-G{\'o}ral, A. and de Ram{\'o}n, E. and Galeazzi, M. and Haga, H. J. and Mathieu, A. and Houssiau, F. and Ingelmo, M. and Hughes, G. R. V.},
  collaborator = {Lopezsoto, A. and Vivancos, J. and Urbanomarquez, A. and Vianna, J. and Pintado, V. and Lopezdupla, M. and Vazquez, J. J. and Camps, M. and Frutos, M. A. and Perello, I. and Santos, P. G. and Abarca, M. and Nebro, A. F. and Tokgoz, G. and Maldykowa, H. and Chwalinskasadowska, H.},
  title     = {Systemic lupus erythematosus: clinical and immunological patterns of disease expression in a cohort of 1,000 patients},
  journal   = {Medicine},
  year      = {1993},
  volume    = {72},
  pages     = {113-124},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Calvo-Alen1995,
  author    = {Calvo-Alen, J. and Bastian, H. M. and Straaton, K. V. and Burgard, S. L. and Mikhail, I. S. and Alarcon, G. S.},
  title     = {Identification of patient subsets among those presumptively diagnosed with, referred, and/or followed up for systemic lupus erythematosus at a large tertiary care center},
  journal   = {Arthritis and Rheumatism},
  year      = {1995},
  volume    = {38},
  pages     = {1475-1484},
  number    = {10},
  abstract  = {Objective. To identify different subsets of patients from a large tertiary care center who were presumptively referred for and/or diagnosed with systemic lupus erythematosus (SLE) (or followed up). Methods. All patients who were referred, followed up, and/or diagnosed with SLE at our center, who had disease duration of less than or equal to 5 years, and who resided in Alabama, were identified and their charts reviewed and abstracted. Results. Abstracted data were reviewed by 3 rheumatologists, and patients were assigned to 1 of 3 categories: 1) SLE by the American College of Rheumatology (ACR; formerly, the American Rheumatism Association) criteria, 2) clinical SLE but not meeting 4 of the ACR criteria, or 3) fibromyalgia-like manifestations with antinuclear antibody (ANA) positivity. There were 90 patients in the first group (criteria), 22 in the second group (clinical), and 37 in the third group (fibromyalgia-like). Patients in all 3 groups were predominantly women. Only 5\% of the fibromyalgia-like group were African-American, compared with 55-65\% for the other 2 groups. Organ system involvement occurred with comparable frequency in the first 2 groups, but mucocutaneous and hematologic abnormalities were more frequent in the criteria group; in contrast, the patients with fibromyalgia-like symptoms primarily presented with arthralgias/myalgias, fatigue, depression, and sleep disturbances, as well as mucocutaneous manifestations. Conclusion. When the ACR criteria for SLE are used to determine eligibility for lupus studies, a group of patients with clinically unequivocal SLE are excluded. A group of patients with fibromyalgia-like manifestations, who test positive for ANA and differ clinically and sociodemographically from the patients in the other 2 groups, very likely do not belong within the spectrum of SLE.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Bull2002,
  author    = {Bull, S. B. and Mak, C. and Greenwood, C. M.
T.},
  title     = {A modified score function estimator for multinomial logistic regression in small samples},
  journal   = {Computational Statistics \& Data Analysis},
  year      = {2002},
  volume    = {39},
  pages     = {57-74},
  number    = {1},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Bull2007,
  author    = {Bull, S. B. and Lewinger, J. P. and Lee, S. S. F.},
  title     = {Confidence intervals for multinomial logistic regression in sparse data},
  journal   = {Statistics in Medicine},
  year      = {2007},
  volume    = {26},
  pages     = {903-918},
  number    = {4},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Kim2004,
  author    = {Kim, K. Y. and Kim, B. J. and Yi, G. S.},
  title     = {Reuse of imputed data in microarray analysis increases imputation efficiency},
  journal   = {BMC Bioinformatics},
  year      = {2004},
  volume    = {5},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Kim2007,
  author    = {Kim, D. W. and Lee, K. Y. and Lee, K. H. and Lee, D.},
  title     = {Towards clustering of incomplete microarray data without the use of imputation},
  journal   = {Bioinformatics},
  year      = {2007},
  volume    = {23},
  pages     = {107-113},
  number    = {1},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@INPROCEEDINGS{Kavanaugh1998,
  author    = {Kavanaugh, A. and Tomar, R. and Reveille, J. and Solomon, D. H. and Homburger, H. A.},
  title     = {Guidelines for clinical use of the antinuclear antibody test and tests for specific autoantibodies to nuclear antigens},
  booktitle = {32nd College-of-American-Pathologists Conference},
  year      = {1998},
  pages     = {71-81},
  address   = {Chicago, Illinois},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Kamen2008,
  author    = {Kamen, D. L. and Barron, M. and Parker, T. M. and Shaftman, S. R. and Bruner, G. R. and Aberle, T. and James, J. A. and Scofield, R. H. and Harley, J. B. and Gilkeson, G.
S.},
  title     = {Autoantibody prevalence and lupus characteristics in a unique African American population},
  journal   = {Arthritis and Rheumatism},
  year      = {2008},
  volume    = {58},
  pages     = {1237-1247},
  number    = {5},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Jury2001,
  author    = {Jury, E. C. and D'Cruz, D. and Morrow, W. J. W.},
  title     = {Autoantibodies and overlap syndromes in autoimmune rheumatic disease},
  journal   = {Journal of Clinical Pathology},
  year      = {2001},
  volume    = {54},
  pages     = {340-347},
  number    = {5},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Jurencak2009,
  author    = {Jurencak, R. and Fritzler, M. and Tyrrell, P. and Hiraki, L. and Benseler, S. and Silverman, E.},
  title     = {Autoantibodies in Pediatric Systemic Lupus Erythematosus: Ethnic Grouping, Cluster Analysis, and Clinical Correlations},
  journal   = {Journal of Rheumatology},
  year      = {2009},
  volume    = {36},
  pages     = {416-421},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Janssens2007,
  author    = {Janssens, A. C. J. W. and Moonesinghe, R. and Yang, Q. and Steyerberg, E. W. and van Duijn, C. M. and Khoury, M. J.},
  title     = {The impact of genotype frequencies on the clinical validity of genomic profiling for predicting common chronic diseases},
  journal   = {Genetics in Medicine},
  year      = {2007},
  volume    = {9},
  pages     = {528-535},
  number    = {8},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Jakobsdottir2009,
  author    = {Jakobsdottir, J. and Gorin, M. B. and Conley, Y. P. and Ferrell, R. E. and Weeks, D. E.},
  title     = {Interpretation of Genetic Association Studies: Markers with Replicated Highly Significant Odds Ratios May Be Poor Classifiers},
  journal   = {PLoS Genetics},
  year      = {2009},
  volume    = {5},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Zweig1993,
  author    = {Zweig, M. H.
and Campbell, G.},
  title     = {Receiver-operating characteristic ({ROC}) plots - a fundamental evaluation tool in clinical medicine},
  journal   = {Clinical Chemistry},
  year      = {1993},
  volume    = {39},
  pages     = {561-577},
  number    = {4},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Isenberg1997,
  author    = {Isenberg, D. A. and Garton, M. and Reichlin, M. W. and Reichlin, M.},
  title     = {Long-term follow-up of autoantibody profiles in Black female lupus patients and clinical comparison with Caucasian and Asian patients},
  journal   = {British Journal of Rheumatology},
  year      = {1997},
  volume    = {36},
  pages     = {229-233},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Zonana-Nacach2000,
  author    = {Zonana-Nacach, A. and Barr, S. G. and Magder, L. S. and Petri, M.},
  title     = {Damage in systemic lupus erythematosus and its association with corticosteroids},
  journal   = {Arthritis and Rheumatism},
  year      = {2000},
  volume    = {43},
  pages     = {1801-1808},
  number    = {8},
  abstract  = {Objective. To evaluate the association between corticosteroid use and organ damage in patients with systemic lupus erythematosus (SLE). Method. The occurrence and date of organ damage, as measured by the Systemic Lupus International Collaborating Clinics/American College of Rheumatology Damage Index, were determined for 539 patients enrolled in the Hopkins Lupus Cohort Study. The risk of damage associated with the cumulative prednisone dose, high-dose prednisone (greater than or equal to 60 mg/day for greater than or equal to 2 months), and pulse methylprednisolone (1,000 mg intravenously for 1-3 days) was estimated using Cox proportional hazards regression analyses, controlling for age, race, and sex. Risk estimates for the cumulative prednisone dose were based on a reference dose of 36.5 gm (e.g., 10 mg of prednisone daily for 10 years [or equivalent]). Results.
The cumulative prednisone dose was significantly associated with the development of osteoporotic fractures (relative risk [RR] 2.5, 95\% confidence interval [95\% CI] 1.7, 3.7), symptomatic coronary artery disease (RR 1.7, 95\% CI 1.1, 2.5), and cataracts (RR 1.9, 95\% CI 1.4, 2.5). Each intravenous pulse was associated with a small increase in the risk of osteoporotic fractures (RR 1.3, 95\% CI 1.0, 1.8); however, this result failed to reach statistical significance (P = 0.07). Each 2-month exposure to high-dose prednisone was associated with a 1.2-fold increase in the risk of both avascular necrosis (95\% CI 1.1, 1.4) and stroke (95\% CI 1.0, 1.5). Conclusion. SLE patients receiving long-term prednisone therapy were at significant risk of morbidity due to permanent organ damage. Additional research is required to determine the relative contributions of SLE disease activity and corticosteroids to the pathogenesis of specific types of organ damage. Furthermore, new steroid-sparing therapies are needed in order to treat disease activity and minimize cumulative and high-dose prednisone exposure.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Isenberg1999,
  author    = {Isenberg, D. and Black, C.},
  title     = {Naming names! Comment on the article by Smolen and Steiner},
  journal   = {Arthritis and Rheumatism},
  year      = {1999},
  volume    = {42},
  pages     = {191-193},
  number    = {1},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Zabaleta-Lanz2006,
  author    = {Zabaleta-Lanz, M. E. and Munoz, L. E. and Tapanes, F. J. and Vargas-Arenas, R. E. and Daboin, I. and Barrios, Y. and Pinto, J. A. and Bianco, N. E.},
  title     = {Further description of early clinically silent lupus nephritis},
  journal   = {Lupus},
  year      = {2006},
  volume    = {15},
  pages     = {845-851},
  number    = {12},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Werth2005,
  author    = {Werth, V.
P.},
  title     = {Clinical manifestations of cutaneous lupus erythematosus},
  journal   = {Autoimmunity Reviews},
  year      = {2005},
  volume    = {4},
  pages     = {296-302},
  number    = {5},
  abstract  = {The skin findings seen in lupus erythematosus can present with either lupus-specific or lupus-nonspecific findings, with lupus-specific skin disease showing findings histopathologically distinct for cutaneous lupus erythematosus. Lupus-specific skin diseases include chronic cutaneous, subacute cutaneous, and acute cutaneous lupus erythematosus. The types of skin lesions in each group are clinically distinct and recognizing the specific subsets helps in prognosticating the likelihood of underlying systemic lupus. A number of medications are associated with cutaneous lupus, in particular with subacute cutaneous lupus erythematosus. Lupus nonspecific skin lesions are not histopathologically distinct for cutaneous lupus and/or may be seen as a feature of another disease process. Nonspecific disease-related skin lesions are frequently seen in patients with SLE, usually in the active phase of the disease. The current ACR classification criteria for SLE include four somewhat overlapping dermatologic criteria, butterfly rash, discoid lupus, photosensitivity, and oral ulcers and thus patients can be classified as having SLE with only skin manifestations. (c) 2005 Elsevier B.V. All rights reserved.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Bull1997,
  author    = {Bull, S. B. and Greenwood, C. M. T. and Hauck, W. W.},
  title     = {Jackknife bias reduction for polychotomous logistic regression},
  journal   = {Statistics in Medicine},
  year      = {1997},
  volume    = {16},
  pages     = {545-560},
  number    = {5},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Ward1990,
  author    = {Ward, M. M.
and Studenski, S.},
  title     = {Clinical manifestations of systemic lupus erythematosus - identification of racial and socioeconomic influences},
  journal   = {Archives of Internal Medicine},
  year      = {1990},
  volume    = {150},
  pages     = {849-853},
  number    = {4},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@BOOK{Brownson2006,
  title     = {Applied epidemiology: theory to practice},
  publisher = {Oxford University Press},
  year      = {2006},
  editor    = {Brownson, Ross C. and Petitti, Diana B.},
  address   = {Oxford ; New York},
  edition   = {2nd},
  note      = {2005053914 edited by Ross C. Brownson, Diana B. Petitti. ill. ; 25 cm. Includes bibliographical references and index. Epidemiology: a foundation of public health -- Public health surveillance -- Outbreak and cluster investigations -- Systematic reviews in public health -- Epidemiology and risk assessment -- Epidemiologic issues in community intervention -- Outcomes research -- Measuring the quality of health care -- Epidemiology and health policy -- Epidemiology and the law -- Communicating epidemiologic information.},
  keywords  = {Epidemiology Research Methodology. Epidemiology. Epidemiologic Methods. Health Policy. Quality of Health Care.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Ward1995,
  author    = {Ward, M. M. and Pyun, E. and Studenski, S.},
  title     = {Long-term survival in systemic lupus erythematosus - patient characteristics associated with poorer outcomes},
  journal   = {Arthritis and Rheumatism},
  year      = {1995},
  volume    = {38},
  pages     = {274-283},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@BOOK{Brownson1998,
  title     = {Applied epidemiology: theory to practice},
  publisher = {Oxford University Press},
  year      = {1998},
  editor    = {Brownson, Ross C. and Petitti, Diana B.},
  address   = {New York},
  note      = {97016894 edited by Ross C. Brownson, Diana B. Petitti. ill. ; 24 cm. Includes bibliographical references and index.},
  keywords  = {Epidemiology Research Methodology. Epidemiology.
Epidemiologic Methods. Quality of Health Care. Health Policy.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Vila2004,
  author    = {Vila, L. M. and Alarcon, G. S. and McGwin, G. and Friedman, A. W. and Baethge, B. A. and Bastian, H. M. and Fessler, B. J. and Reveille, J. D.},
  title     = {Early clinical manifestations, disease activity and damage of systemic lupus erythematosus among two distinct US Hispanic subpopulations},
  journal   = {Rheumatology},
  year      = {2004},
  volume    = {43},
  pages     = {358-363},
  number    = {3},
  abstract  = {Objectives. To compare the baseline clinical manifestations, immunological features, disease activity and damage accrual in systemic lupus erythematosus (SLE) patients from two US Hispanic subgroups. Methods. A total of 105 Hispanic SLE patients from Texas (a population of Mexican or Central American ancestry) and 81 from the island of Puerto Rico (all Puerto Ricans) participating in a longitudinal study of outcome were examined. The socio-economic/demographic, clinical and immunological variables were obtained at the time of enrolment (T-0). Disease activity was determined with the Systemic Lupus Activity Measure (SLAM), and disease damage with the Systemic Lupus International Collaborating Clinics (SLICC) Damage Index (SDI). Disease activity was also determined at the time of diagnosis (T-D). Results. At T-0 Hispanics from Texas were younger than those from Puerto Rico (33.1 +/- 12.0 vs 37.5 +/- 11.6 yr, P = 0.0125). Both groups were similar with regard to gender distribution (92.4 vs 95.1\% females) and disease duration (1.4 +/- 1.4 vs 1.7 +/- 1.3 yr). Hispanics from Texas were more likely to have serositis (60.0 vs 8.6\%, P < 0.0001), renal involvement (41.0 vs 13.6\%, P < 0.0001), psychosis (5.7 vs 0.0\%, P = 0.0365) and thrombocytopenia (21.0 vs 3.7\%, P = 0.0006).
On the other hand, Hispanics from Puerto Rico were more likely to have photosensitivity (81.5 vs 41.0\%, P < 0.0001), malar rash (65.4 vs 45.7\%, P = 0.0074) and discoid rash (13.6 vs 2.9\%, P = 0.0060). At baseline, the presence of anti-dsDNA antibodies was higher in Hispanics from Texas (69.5\% vs 46.9\%, P = 0.0018) while anti-Ro antibodies were more frequent in Hispanics from Puerto Rico (24.7 vs 11.4\%, P = 0.0175). Mean SLAM scores at T-D (12.9 +/- 6.4 vs 9.1 +/- 4.6, P < 0.0001) and T-0 (10.9 +/- 6.3 vs 6.6 +/- 3.8, P < 0.0001) were significantly higher in Hispanics from Texas. Similarly, mean SDI scores at T-0 were higher in Hispanics from Texas (0.67 +/- 1.08 vs 0.26 +/- 0.54, P = 0.0026). By stepwise Poisson regression, SDI scores were associated with older age, disease activity and ethnicity (Hispanics from Texas). Conclusions. Early in SLE, marked differences are observed between Hispanics from Texas and Puerto Rico. Higher disease activity, more major organ involvement, higher frequency of anti-dsDNA antibodies and more damage accrual occur in Hispanic lupus patients from Texas than in those from Puerto Rico.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Bombardier1992,
  author    = {Bombardier, C. and Gladman, D. D. and Urowitz, M. B. and Caron, D. and Chang, C. H.},
  title     = {Derivation of the {SLEDAI} - a disease-activity index for lupus patients},
  journal   = {Arthritis and Rheumatism},
  year      = {1992},
  volume    = {35},
  pages     = {630-640},
  number    = {6},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Biesheuvel2008,
  author    = {Biesheuvel, C. J. and Vergouwe, Y. and Steyerberg, E. W. and Grobbee, D. E. and Moons, K. G.
M.},
  title     = {Polytomous logistic regression analysis could be applied more often in diagnostic research},
  journal   = {Journal of Clinical Epidemiology},
  year      = {2008},
  volume    = {61},
  pages     = {125-134},
  number    = {2},
  abstract  = {Objective: Physicians commonly consider the presence of all differential diagnoses simultaneously. Polytomous logistic regression modeling allows for simultaneous estimation of the probability of multiple diagnoses. We discuss and (empirically) illustrate the value of this method for diagnostic research. Study Design and Setting: We used data from a study on the diagnosis of residual retroperitoneal mass histology in patients presenting with nonseminomatous testicular germ cell tumor. The differential diagnoses include benign tissue, mature teratoma, and viable cancer. Probabilities of each diagnosis were estimated with a polytomous logistic regression model and compared with the probabilities estimated from two consecutive dichotomous logistic regression models. Results: We provide interpretations of the odds ratios derived from the polytomous regression model and present a simple score chart to facilitate calculation of predicted probabilities from the polytomous model. For both modeling methods, we show the calibration plots and receiver operating characteristics curve (ROC) areas comparing each diagnostic outcome category with the other two. The ROC areas for benign tissue, mature teratoma, and viable cancer were similar for both modeling methods, 0.83 (95\% confidence interval [CI] = 0.80-0.85) vs. 0.83 (95\% CI = 0.80-0.85), 0.78 (95\% CI = 0.75-0.81) vs. 0.78 (95\% CI = 0.75-0.81), and 0.66 (95\% CI = 0.61-0.71) vs. 0.64 (95\% CI = 0.59-0.69), for polytomous and dichotomous regression models, respectively. Conclusion: Polytomous logistic regression is a useful technique to simultaneously model predicted probabilities of multiple diagnostic outcome categories.
The performance of a polytomous prediction model can be assessed similarly to a dichotomous logistic regression model, and predictions by a polytomous model can be made with a user-friendly method. Because the simultaneous consideration of the presence of multiple (differential) conditions serves clinical practice better than consideration of the presence of only one target condition, polytomous logistic regression could be applied more often in diagnostic research. (C) 2008 Elsevier Inc. All rights reserved.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Begg1994,
  author    = {Begg, C. B. and Zhang, Z. F.},
  title     = {Statistical analysis of molecular epidemiology studies employing case-series},
  journal   = {Cancer Epidemiology Biomarkers \& Prevention},
  year      = {1994},
  volume    = {3},
  pages     = {173-175},
  number    = {2},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Aringer2005,
  author    = {Aringer, M. and Steiner, G. and Smolen, J. S.},
  title     = {Does mixed connective tissue disease exist? Yes},
  journal   = {Rheumatic Disease Clinics of North America},
  year      = {2005},
  volume    = {31},
  pages     = {411-420},
  number    = {3},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Aringer2007,
  author    = {Aringer, M. and Smolen, J. S.},
  title     = {Mixed connective tissue disease: what is behind the curtain?},
  journal   = {Best Practice \& Research in Clinical Rheumatology},
  year      = {2007},
  volume    = {21},
  pages     = {1037-1049},
  number    = {6},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Arbuckle2003,
  author    = {Arbuckle, M. R. and McClain, M. T. and Rubertone, M. V. and Scofield, R. H. and Dennis, G. J. and James, J. A. and Harley, J.
B.},
  title     = {Development of autoantibodies before the clinical onset of systemic lupus erythematosus},
  journal   = {New England Journal of Medicine},
  year      = {2003},
  volume    = {349},
  pages     = {1526-1533},
  number    = {16},
  abstract  = {BACKGROUND: Although much is known about the natural history of systemic lupus erythematosus (SLE), the development of SLE autoantibodies before the diagnosis of the disease has not been extensively explored. We investigated the onset and progression of autoantibody development before the clinical diagnosis. METHODS: The Department of Defense Serum Repository contains approximately 30 million specimens prospectively collected from more than 5 million U.S. Armed Forces personnel. We evaluated serum samples obtained from 130 persons before they received a diagnosis of SLE, along with samples from matched controls. RESULTS: In 115 of the 130 patients with SLE (88 percent), at least one SLE autoantibody tested was present before the diagnosis (up to 9.4 years earlier; mean, 3.3 years). Antinuclear antibodies were present in 78 percent (at a dilution of 1:120 or more), anti-double-stranded DNA antibodies in 55 percent, anti-Ro antibodies in 47 percent, anti-La antibodies in 34 percent, anti-Sm antibodies in 32 percent, anti-nuclear ribonucleoprotein antibodies in 26 percent, and antiphospholipid antibodies in 18 percent. Antinuclear, antiphospholipid antibodies, anti-Ro, and anti-La antibodies were present earlier than anti-Sm and anti-nuclear ribonucleoprotein antibodies (a mean of 3.4 years before the diagnosis vs. 1.2 years, P=0.005). Anti-double-stranded DNA antibodies, with a mean onset 2.2 years before the diagnosis, were found later than antinuclear antibodies (P=0.06) and earlier than anti-nuclear ribonucleoprotein antibodies (P=0.005).
For many patients, the earliest available serum sample was positive; therefore, these measures of the average time from the first positive antibody test to the diagnosis are underestimates of the time from the development of antibodies to the diagnosis. Of the 130 initial matched controls, 3.8 percent were positive for one or more autoantibodies. CONCLUSIONS: Autoantibodies are typically present many years before the diagnosis of SLE. Furthermore, the appearance of autoantibodies in patients with SLE tends to follow a predictable course, with a progressive accumulation of specific autoantibodies before the onset of SLE, while patients are still asymptomatic.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Arbuckle2003a,
  author    = {Arbuckle, M. R. and James, J. A. and Dennis, G. J. and Rubertone, M. V. and McClain, M. T. and Kim, X. R. and Harley, J. B.},
  title     = {Rapid clinical progression to diagnosis among African-American men with systemic lupus erythematosus},
  journal   = {Lupus},
  year      = {2003},
  volume    = {12},
  pages     = {99-106},
  number    = {2},
  abstract  = {The initial clinical course of systemic lupus erythematosus (SLE) is variable, ranging from relatively minor manifestations progressing over years to rapid onset of fulminate disease. We sought to identify factors associated with the rapid manifestation of SLE. Chart review of military medical records was used to identify 130 patients who met the American College of Rheumatology classification criteria for SLE. Demographics, clinical criteria date of occurrence, and the date of SLE classification (at least four clinical criteria) met were documented. Prospectively stored serum samples prior to the diagnosis were evaluated for SLE autoantibodies. Median time from the first recorded criteria to diagnosis was significantly shorter in African-American (AA) males compared with AA females and European American (EA) females and males combined.
AA males were more likely to have nephritis as their first clinical symptom. Also, less time transpired between the first clinical criterion and SLE diagnosis in AA males with nephritis than in other groups presenting with nephritis. Even when cases presenting with nephritis were excluded, a diagnosis of SLE was made more rapidly in AA males. African-American men progress from initial clinical manifestations to SLE diagnosis more rapidly than other ethnic or gender groups.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Hoffman2004,
  author    = {Hoffman, I. E. A. and Peene, I. and Meheus, L. and Huizinga, T. W. J. and Cebecauer, L. and Isenberg, D. and De Bosschere, K. and Hulstaert, F. and Veys, E. M. and De Keyser, F.},
  title     = {Specific antinuclear antibodies are associated with clinical features in systemic lupus erythematosus},
  journal   = {Annals of the Rheumatic Diseases},
  year      = {2004},
  volume    = {63},
  pages     = {1155-1158},
  number    = {9},
  abstract  = {Objectives: To study associations between antinuclear antibodies (ANA) and signs/symptoms in patients with systemic lupus erythematosus (SLE). Methods: A consecutive cohort of 289 patients with SLE was included; 235 fulfilled ACR criteria for SLE and were further analysed. ANA profiles were determined by line immunoassay and by indirect immunofluorescence on Crithidia luciliae. An extensive list of signs/symptoms was evaluated. Results: Five clusters of antibodies were defined by cluster analysis: 1 - antibodies to SmB, SmD, RNP-A, RNP-C, and RNP-70k; 2 - antibodies to Ro52, Ro60, and SSB; 3, 4, and 5 - antibodies to ribosomal P, histones and dsDNA, respectively. Significant associations (p $\leq$ 0.01) were found between anti-RNP-70k, anti-RNP-A, anti-RNP-C and Raynaud's phenomenon, between anti-RNP-A, anti-RNP-70k and leucopenia, and between anti-RNP-A, anti-RNP-C and a lower prevalence of urine cellular casts.
Anti-SSA, anti-SSB were associated with xerostomia, and anti-SSB with pericarditis. Antibodies to ribosomal P were associated with haemolytic anaemia, leucopenia, and alopecia. Patients with anti-dsDNA antibodies had a higher risk for cellular casts and a lower risk for photosensitivity. Antihistone antibodies were associated with arthritis. Conclusions: In a large and consecutive cohort of patients with SLE, clusters of antibodies were identified. Previously reported associations of antibodies with symptoms were confirmed and new associations found.},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Hochberg1985,
  author    = {Hochberg, M. C. and Boyd, R. E. and Ahearn, J. M. and Arnett, F. C. and Bias, W. B. and Provost, T. T. and Stevens, M. B.},
  title     = {Systemic lupus erythematosus - a review of clinico-laboratory features and immunogenetic markers in 150 patients with emphasis on demographic subsets},
  journal   = {Medicine},
  year      = {1985},
  volume    = {64},
  pages     = {285-295},
  number    = {5},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Hochberg1997,
  author    = {Hochberg, M. C.},
  title     = {Updating the American College of Rheumatology revised criteria for the classification of systemic lupus erythematosus},
  journal   = {Arthritis and Rheumatism},
  year      = {1997},
  volume    = {40},
  pages     = {1725},
  number    = {9},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Hermann2005,
  author    = {Hermann, C. and Westergaard, T. and Pedersen, B. V. and Wohlfahrt, J. and Host, A. and Melbye, M.},
  title     = {A comparison of risk factors for wheeze and recurrent cough in preschool children},
  journal   = {American Journal of Epidemiology},
  year      = {2005},
  volume    = {162},
  pages     = {345-350},
  number    = {4},
  owner     = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Venables2006,
  author    = {Venables, P. J.
W.},
  title = {Mixed connective tissue disease},
  journal = {Lupus},
  year = {2006},
  volume = {15},
  pages = {132--137},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@INPROCEEDINGS{Heinze2005,
  author = {Heinze, G.},
  title = {A comparative investigation of methods for logistic regression with separated or nearly separated data},
  booktitle = {26th Annual Meeting of the International-Society-for-Clinical-Biostatistics (ISCB26)},
  year = {2005},
  pages = {4216--4226},
  address = {Szeged, HUNGARY},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% NOTE(review): Tyler1998a lists a bare surname "Tyler" as first author, and
% its pages field holds what looks like the whole-book extent rather than the
% chapter's page range. Both are left as found; confirm against the source.
@INCOLLECTION{Tyler1998a,
  author = {Tyler and Last, John M.},
  title = {Epidemiology},
  booktitle = {Maxcy-Rosenau-Last public health \& preventive medicine},
  publisher = {Appleton \& Lange},
  year = {1998},
  editor = {Maxcy, Kenneth Fuller and Rosenau, M. J. and Last, John M. and Wallace, Robert B.},
  pages = {xxviii, 1291 p.},
  address = {Stamford, Conn.},
  edition = {14th},
  note = {97053116 editor, Robert B. Wallace. Public health \& preventive medicine Public health and preventive medicine ill., maps ; 29 cm. Includes bibliographical references and index.},
  keywords = {Public health. Medicine, Preventive. Public Health Preventive Medicine},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% Hardle1993: all-caps title lowered to sentence case; wording unchanged.
@ARTICLE{Hardle1993,
  author = {Hardle, W. and Mammen, E.},
  title = {Comparing nonparametric versus parametric regression fits},
  journal = {Annals of Statistics},
  year = {1993},
  volume = {21},
  pages = {1926--1947},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@INBOOK{Torrence1997,
  chapter = {Epidemiologic concepts of disease},
  title = {Understanding epidemiology},
  publisher = {Mosby},
  year = {1997},
  author = {Torrence, Mary E.},
  series = {Mosby's biomedical science series},
  address = {St. Louis},
  note = {96039467 Mary E. Torrence. ill., maps ; 24 cm. Includes bibliographical references and index.},
  keywords = {Epidemiology.
Epidemiology.}, owner = {rebeccaspeckman}, timestamp = {2009.11.10} }

% Greer1989: all-caps title lowered to sentence case; wording unchanged.
@ARTICLE{Greer1989,
  author = {Greer, J. M. and Panush, R. S.},
  title = {Incomplete lupus-erythematosus},
  journal = {Archives of Internal Medicine},
  year = {1989},
  volume = {149},
  pages = {2473--2476},
  number = {11},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{To2005,
  author = {To, C. H. and Petri, M.},
  title = {Is antibody clustering predictive of clinical subsets and damage in systemic lupus erythematosus?},
  journal = {Arthritis and Rheumatism},
  year = {2005},
  volume = {52},
  pages = {4003--4010},
  number = {12},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Greenland1999,
  author = {Greenland, S.},
  title = {A unified approach to the analysis of case-distribution (case-only) studies},
  journal = {Statistics in Medicine},
  year = {1999},
  volume = {18},
  pages = {1--15},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% Tikly1996: the proper noun is braced so styles that sentence-case titles
% keep its capitals.
@ARTICLE{Tikly1996,
  author = {Tikly, M. and Burgin, S. and Mohanlal, P. and Bellingan, A. and George, J.},
  title = {Autoantibodies in black {South Africans} with systemic lupus erythematosus: Spectrum and clinical associations},
  journal = {Clinical Rheumatology},
  year = {1996},
  volume = {15},
  pages = {261--265},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Goeman2006,
  author = {Goeman, J. J. and le Cessie, S.},
  title = {A goodness-of-fit test for multinomial logistic regression},
  journal = {Biometrics},
  year = {2006},
  volume = {62},
  pages = {980--985},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Terborg1990,
  author = {Terborg, E. J. and Horst, G. and Hummel, E. J. and Limburg, P. C. and Kallenberg, C. G.
M.},
  title = {Measurement of increases in anti-double-stranded {DNA} antibody-levels as a predictor of disease exacerbation in systemic lupus-erythematosus - a long-term, prospective-study},
  journal = {Arthritis and Rheumatism},
  year = {1990},
  volume = {33},
  pages = {634--643},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% Terborg1990 (above) and Terborg1990a: all-caps titles lowered to sentence
% case with acronyms braced; wording and hyphenation are unchanged.
@ARTICLE{Terborg1990a,
  author = {Terborg, E. J. and Groen, H. and Horst, G. and Limburg, P. C. and Wouda, A. A. and Kallenberg, C. G. M.},
  title = {Clinical associations of antiribonucleoprotein antibodies in patients with systemic lupus-erythematosus},
  journal = {Seminars in Arthritis and Rheumatism},
  year = {1990},
  volume = {20},
  pages = {164--173},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Ananth1997,
  author = {Ananth, C. V. and Kleinbaum, D. G.},
  title = {Regression models for ordinal responses: A review of methods and applications},
  journal = {International Journal of Epidemiology},
  year = {1997},
  volume = {26},
  pages = {1323--1333},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Tapanes2000,
  author = {Tapanes, F. J. and Vasquez, M. and Ramirez, R. and Matheus, C. and Rodriguez, M. A. and Bianco, N.},
  title = {Cluster analysis of antinuclear autoantibodies in the prognosis of {SLE} nephropathy: are anti-extractable nuclear antibodies protective?},
  journal = {Lupus},
  year = {2000},
  volume = {9},
  pages = {437--444},
  number = {6},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Allison2003,
  author = {Allison, P. D.},
  title = {Missing data techniques for structural equation modeling},
  journal = {Journal of Abnormal Psychology},
  year = {2003},
  volume = {112},
  pages = {545--557},
  number = {4},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Swanton2005,
  author = {Swanton, J.
and Isenberg, D.},
  title = {Mixed connective tissue disease: Still crazy after all these years},
  journal = {Rheumatic Disease Clinics of North America},
  year = {2005},
  volume = {31},
  pages = {421-+},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% NOTE(review): Swanton2005 (above) keeps pages "421-+" exactly as found; the
% end page is not recoverable from this file -- look it up and replace with a
% proper range.
%
% Alarcon2004: the roman numeral and the organisation name in the title are
% braced so styles that sentence-case titles keep their capitals.
@ARTICLE{Alarcon2004,
  author = {Alarcon, G. S. and McGwin, G. and Roseman, J. M. and Uribe, A. and Fessler, B. J. and Bastian, H. M. and Friedman, A. W. and Baethge, B. and Vila, L. M. and Reveille, J. D.},
  title = {Systemic lupus erythematosus in three ethnic groups. {XIX}. Natural history of the accrual of the {American College of Rheumatology} criteria prior to the occurrence of Criteria diagnosis},
  journal = {Arthritis \& Rheumatism-Arthritis Care \& Research},
  year = {2004},
  volume = {51},
  pages = {609--615},
  number = {4},
  abstract = {Objective. To determine how the American College of Rheumatology (ACR) criteria for the classification of systemic lupus erythematosus (SLE) accrue in a multiethnic cohort of SLE patients. Methods. SLE patients enrolled in a longitudinal study of outcome were analyzed (LUMINA; Lupus in Minorities: Nature versus nurture) for the manner in which ACR criteria manifestations occurred prior to the accrual of 4 of them. Time at which a criterion was said to be present was determined by review of all previously available medical records and interview. Univariable and multivariable Cox proportional hazard models were examined for the association with time to accrual of 4 ACR criteria; results were reported as hazard ratios. Results. There were 103 Texas Hispanic (of Mexican or Central America ancestry) patients, 55 Puerto Rico Hispanics, 176 African Americans, and 137 Caucasians. The mean +/- SD and median (range) time to accrual of 4 ACR criteria were 29.4 +/- 52.0 months and 9.1 (0 - 328.7) months; time was shortest for the Texas Hispanics (18.4 +/- 42.8 and 5.0 [0 - 248] months) and longest for the Caucasians (39.9 +/- 59.3 months and 17.7 [0 - 324.6] months).
Arthritis was the most frequent first criterion (34.5\%); it was followed by photosensitivity (18.8\%). When 2 criteria occurred from the outset, the most frequent combination was arthritis and antinuclear antibody positivity followed by malar rash and photosensitivity. A Cox-regression multivariable model identified Hispanic ethnicity (from Texas) and HLA-DRB1*0301 as predictors of short time to criteria accrual, whereas older age and married/living together were associated with long time to criteria accrual. Conclusion. Significant variability in the evolution of ACR criteria manifestations does occur. Texas Hispanics are more likely to have a rapid evolution of criteria manifestations, but several years may elapse before ACR criteria are accrued.}, owner = {rebeccaspeckman}, timestamp = {2009.11.10} }

% The abstract text above escapes its percent signs so the field is safe if a
% style ever typesets it.
%
% Staniswalis1991: all-caps title lowered to sentence case; wording unchanged.
@ARTICLE{Staniswalis1991,
  author = {Staniswalis, J. G. and Severini, T. A.},
  title = {Diagnostics for assessing regression-models},
  journal = {Journal of the American Statistical Association},
  year = {1991},
  volume = {86},
  pages = {684--692},
  number = {415},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% Alarcon1999: the roman numeral and the cohort acronym in the title are
% braced against sentence-casing; the abstract's spelled-out "less than or
% equal to" is written as TeX math.
@ARTICLE{Alarcon1999,
  author = {Alarcon, G. S. and Friedman, A. W. and Straaton, K. V. and Moulds, J. M. and Lisse, J. and Bastian, H. M. and McGwin, G. and Bartolucci, A. A. and Roseman, J. M. and Reveille, J. D.},
  title = {Systemic lupus erythematosus in three ethnic groups: {III} A comparison of characteristics early in the natural history of the {LUMINA} cohort},
  journal = {Lupus},
  year = {1999},
  volume = {8},
  pages = {197--209},
  number = {3},
  abstract = {Aim: To determine and contrast the socioeconomic-demographic and clinical features of patients with recent onset ($\leq$ 5 y) systemic lupus erythematosus (SLE) from three ethnic groups, Hispanic, African-American and Caucasian (H, AA, C).
Subjects and methods: SLE cases (American College of Rheumatology criteria) (incident (n = 56), prevalent (n = 173)), were enrolled in a longitudinal study at The University of Alabama at Birmingham, The University of Texas-Houston Health Science Center and The University of Texas Medical Branch at Galveston. Socioeconomic-demographic, clinical, immunological, behavioral and psychological data were obtained using validated instruments and standard laboratory techniques, and compared. Results: 70 H, 88 AA and 71 C SLE patients constitute this cohort. H and AA patients were younger and of lower socioeconomic-demographic status. They also had evidence of more frequent organ system involvement (renal, cardiovascular), more auto-antibodies, more active disease (after adjusting for discrepant socioeconomic-demographic features), lower levels of social support and more abnormal illness-related behaviors (more in H than in AA). H also were more likely to have an abrupt disease onset; C were more likely to be on antimalarials but less likely to be on corticosteroids. H, AA, and C used health care resources comparably. They had similar levels of pain and physical and mental functioning after adjusting for age, disease duration, income, education, social support, illness-related behaviors, and Systemic Lupus Activity Measure or SLAM scores. Conclusions: H and AA patients have more active SLE, at an earlier age of onset, and a less favorable socioeconomic-demographic structure (worse among the H than AA) which predispose them to a less favorable natural history.}, owner = {rebeccaspeckman}, timestamp = {2009.11.10} }

% The abstract text above has two spelling slips corrected
% (socioeconomic, corticosteroids).
@ARTICLE{Alarcon2000,
  author = {Alarcon, G.
S.},
  title = {Unclassified or undifferentiated connective tissue disease},
  journal = {Best Practice \& Research in Clinical Rheumatology},
  year = {2000},
  volume = {14},
  pages = {125--137},
  number = {1},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Stallard2009,
  author = {Stallard, N.},
  title = {Simple tests for the external validation of mortality prediction scores},
  journal = {Statistics in Medicine},
  year = {2009},
  volume = {28},
  pages = {377--388},
  number = {3},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

@ARTICLE{Sontheimer2005,
  author = {Sontheimer, R. D.},
  title = {Subacute cutaneous lupus erythematosus: 25-year evolution of a prototypic subset (subphenotype) of lupus erythematosus defined by characteristic cutaneous, pathological, immunological, and genetic findings},
  journal = {Autoimmunity Reviews},
  year = {2005},
  volume = {4},
  pages = {253--263},
  number = {5},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.10}
}

% NOTE(review): Hochberg1993 has no volume, number or pages, and its series
% field merely repeats the journal name. Left as found; complete the record
% from the source if it is ever cited.
@ARTICLE{Hochberg1993,
  author = {Hochberg, Marc C.},
  title = {The history of lupus erythematosus},
  journal = {Lupus Foundation of America Newsletter},
  year = {1993},
  owner = {rebeccaspeckman},
  series = {Lupus Foundation of America Newsletter},
  timestamp = {2009.11.10}
}

% Mooney1997: the proper noun in the title is braced so styles that
% sentence-case titles keep its capitals.
@BOOK{Mooney1997,
  title = {{Monte Carlo} simulation},
  publisher = {Sage Publications},
  year = {1997},
  author = {Mooney, Christopher Z.},
  series = {Sage university papers series. Quantitative applications in the social sciences},
  address = {Thousand Oaks, Calif.},
  note = {96045873 Christopher Z. Mooney. ill. ; 22 cm. Includes bibliographical references (p. 99-110).},
  keywords = {Monte Carlo method.},
  owner = {rebeccaspeckman},
  timestamp = {2009.11.12}
}

@BOOK{Klippel2001,
  title = {Primer on the rheumatic diseases},
  publisher = {Arthritis Foundation},
  year = {2001},
  author = {Klippel, John H.},
  address = {Atlanta, GA},
  edition = {12th},
  note = {[edited by] John H. Klippel, MD ... [et al.].
cm.}, owner = {rebeccaspeckman}, timestamp = {2009.12.07} }
% JabRef metadata follows, stored in comment entries that BibTeX itself ignores:
% four field selectors (journal, author, keywords, publisher; all empty here),
% the groups format version, and the group tree. Each tree node is a nesting
% level followed by a group name and a backslash-semicolon separated list that
% appears to hold the member citation keys.
% NOTE(review): several keys in the tree contain a stray space (for example
% "McLachla n2004"); presumably this is JabRef's fixed-width line wrapping
% rather than a typo -- confirm in JabRef before repairing anything by hand.
@comment{jabref-meta: selector_journal:} @comment{jabref-meta: selector_author:} @comment{jabref-meta: selector_keywords:} @comment{jabref-meta: selector_publisher:} @comment{jabref-meta: groupsversion:3;} @comment{jabref-meta: groupstree: 0 AllEntriesGroup:; 1 ExplicitGroup:Mixture modelling\;0\;Fraley1999\;Fraley2002\;McLachla n2004\;Symons1981\;Tikka2007\;; 2 ExplicitGroup:Bernoulli mixture models\;0\;Carreira-Perpinan2000\;Gr im2006\;Grim2007\;Juan2001\;Juan2004\;Juan2004a\;Seppanen2003\;; 2 ExplicitGroup:relevant?\;0\;Todem2007\;; 2 ExplicitGroup:MCMC LCA/MM\;0\;Garrett2000\;Zeger2003\;; 2 ExplicitGroup:conditional dependence models\;0\;; 2 ExplicitGroup:non-identifiability\;0\;DeBacker1999\;; 2 ExplicitGroup:local maxima\;0\;; 1 ExplicitGroup:Data generation\;0\;Cary\;Krummenauer1998\;Makoto/Taku ji\;Qiu2006\;Savicky2006\;Scott1999\;Steinley2005\;Waller1999\;; 2 ExplicitGroup:Binary data generation\;0\;Chaganty2006\;Gange1995\;Jo e2004\;Joe2006\;Leisch1998\;Lunn1998\;Park1996\;Snijders1991\;; 1 ExplicitGroup:Similarity measures\;0\;Baroni-Urbani1976\;Jackson1989 \;Real1996\;Su2001\;; 1 ExplicitGroup:Cluster analysis\;0\;Dubes1976\;Everitt1979\;He2005\;H e2008\;; 2 ExplicitGroup:MC comparisons/studies\;0\;Bayne1980\;Dolcinar1998\;Do lcinar1998a\;Milligan1981a\;Milligan1983\;; 2 ExplicitGroup:Binary data\;0\;; 2 ExplicitGroup:Reviews\;0\;Milligan1987\;Steinley2006a\;; 2 ExplicitGroup:Misc\;0\;Jain1987\;; 2 ExplicitGroup:Relation to prob methods\;0\;Banfield1993\;Bryant1991\ ;Scott1971\;Steinley2006a\;Symons1981\;; 2 ExplicitGroup:Non-Kmeans-ish methods\;0\;Cesario2007\;Dutta2005\;Edw ards1965\;Gowda1992\;Guha2000\;Orloci1967\;Shatovska2008\;Song2006\;Va nMechelen2004\;Yeung2001a\;Yin2008\;; 2 ExplicitGroup:Alternative optimization criteria\;0\;Friedman1967\;Ma ronna1974\;Marriott1971\;Marriott1982\;Scott1971\;Steinley2006a\;Windh am1987\;; 2
ExplicitGroup:K-centroids\;0\;Baroni-Urbani1976\;Huang1998\;McRae197 1\;Steinley2003\;Steinley2004a\;Steinley2006a\;Steinley2007a\;Steinley 2008\;Steinley2008a\;Su2001\;; 2 ExplicitGroup:Fuzzy clustering\;0\;Manton2004\;; 2 ExplicitGroup:clustering tendency\;0\;Panayirci1983\;; 2 ExplicitGroup:Cluster validation\;0\;Bezdek1998\;Brusco2007a\;Datta2 003\;Dimitriadou2002\;Dubes1979\;Dubes1987\;Irigoien2008\;Jaksic1990\; Lee1979\;; 3 ExplicitGroup:bootstrap, MC etc.\;0\;Bertrand2006\;Kuncheva2006\;Nem ec1988\;Steinley2006\;Tibshirani2001\;Tonidandel2004\;; 3 ExplicitGroup:Null models\;0\;Gordon1994\;Hunter2004\;Jackson1992\;M atthiessen2003\;Tibshirani2001\;Vassiliou1989\;; 3 ExplicitGroup:reviews (some have MC)\;0\;Halkidi\;Handl2005\;Milliga n1981\;Milligan1985\;; 3 ExplicitGroup:cross-validation\;0\;Kapp2007\;Krieger1999\;; 3 ExplicitGroup:methods\;0\;Krzanowski1988\;Loganantharaj2006\;McShane 2002\;Steinley2007\;Strauss1982\;Yeung2001\;; 3 ExplicitGroup:ARI etc.\;0\;Brennan1974\;Fowlkes1983\;Hubert1985\;Kla storin1985\;Mantel1967\;Morey1984\;Rand1971\;Steinley2004\;; 1 ExplicitGroup:Classification (to known classes)\;0\;Guvenir1998\;; 1 ExplicitGroup:Disease applications\;0\;Guvenir1998\;; 1 ExplicitGroup:Bayesian methods\;0\;Casella1992\;Chib1995\;; 1 ExplicitGroup:Neural networks\;0\;DeBacker1999\;; 1 ExplicitGroup:Numerical methods (?)\;0\;Dempster1977\;; }