@misc{brandes_graphml_2005, title = {{GraphML} Primer}, url = {graphml.graphdrawing.org/primer/graphml-primer.html}, abstract = {{GraphML} Primer is a non-normative document intended to provide an easily readable description of the {GraphML} facilities, and is oriented towards quickly understanding how to create {GraphML} documents. This primer describes the language features through examples which are complemented by references to normative texts.}, journal = {Graph Drawing}, author = {Brandes, U. and Eiglsperger, M. and Lerner, J.}, year = {2005}, keywords = {graphs}, howpublished = {graphml.graphdrawing.org/primer/graphml-primer.html} }, @article{chidamber_metrics_1994, title = {A metrics suite for object oriented design}, volume = {20}, issn = {0098-5589}, abstract = {Given the central role that software development plays in the delivery and application of information technology, managers are increasingly focusing on process improvement in the software development area. This demand has spurred the provision of a number of new and/or improved approaches to software development, with perhaps the most prominent being object-orientation {(OO).} In addition, the focus on process improvement has increased the demand for software measures, or metrics with which to manage the process. The need for such metrics is particularly acute when an organization is adopting a new technology for which established practices have yet to be developed. This research addresses these needs through the development and implementation of a new suite of metrics for {OO} design. Metrics developed in previous research, while contributing to the field's understanding of software development processes, have generally been subject to serious criticisms, including the lack of a theoretical base. Following Wand and Weber (1989), the theoretical base chosen for the metrics was the ontology of Bunge (1977). 
Six design metrics are developed, and then analytically evaluated against Weyuker's (1988) proposed set of measurement principles. An automated data collection tool was then developed and implemented to collect an empirical sample of these metrics at two field sites in order to demonstrate their feasibility and suggest ways in which managers may use these metrics for process improvement}, number = {6}, journal = {{IEEE} Transactions on Software Engineering}, author = {Chidamber, {S.R.} and Kemerer, {C.F.}}, year = {1994}, keywords = {cohesion, coupling, metrics, {OOD}, {OOP}}, pages = {476--493}, annote = {only directly accessed attributes, unclear about inherited methods and attributes} }, @article{jain_data_1999, title = {Data clustering: a review}, volume = {31}, shorttitle = {Data clustering}, url = {http://citeseer.ist.psu.edu/jain99data.html}, abstract = {This paper presents an overview of pattern clustering methods from a statistical pattern recognition perspective, with a goal of providing useful advice and references to fundamental concepts accessible to the broad community of clustering practitioners. We present a taxonomy of clustering techniques, and identify cross-cutting themes and recent advances. We also describe some important applications of clustering algorithms such as image segmentation, object recognition, and information...}, number = {3}, journal = {{ACM} Computing Surveys}, author = {Jain, {AK} and Murty, {MN} and Flynn, {PJ}}, year = {1999}, keywords = {clustering, data mining, survey}, pages = {264--323}, annote = {This is one of my favorite papers for discussing the pluses and minuses of various clustering techniques. Relevance: 4} }, @incollection{logozzo_semantic_2006, title = {Semantic Hierarchy Refactoring by Abstract Interpretation}, url = {http://dx.doi.org/10.1007/11609773_21}, abstract = {A semantics-based framework is presented for the definition and manipulation of class hierarchies for object-oriented languages. 
The framework is based on the notion of observable of a class, i.e., an abstraction of its semantics when focusing on a behavioral property of interest. We define a semantic subclass relation, capturing the fact that a subclass preserves the behavior of its superclass up to a given (tunable) observed property. We study the relation between syntactic subclass, as present in mainstream object-oriented languages, and the notion of semantic subclass. The approach is then extended to class hierarchies, leading to a semantics-based modular treatment of a suite of basic observable-preserving operators on hierarchies. We instantiate the framework by presenting effective algorithms that compute a semantic superclass for two given classes, that extend a hierarchy with a new class, and that merge two hierarchies by preserving semantic subclass relations.}, booktitle = {Verification, Model Checking, and Abstract Interpretation}, author = {Logozzo, Francesco and Cortesi, Agostino}, year = {2006}, keywords = {inheritance, refactoring}, pages = {313--331} }, @book{coad_object-oriented_1991, edition = {1}, title = {{Object-Oriented} Design}, isbn = {0136300707}, publisher = {Prentice Hall}, author = {Coad, Peter and Yourdon, Edward}, month = jun, year = {1991} }, @article{lenat_cyc:_1995, title = {{CYC:} a large-scale investment in knowledge infrastructure}, volume = {38}, issn = {0001-0782}, shorttitle = {{CYC}}, url = {http://doi.acm.org/10.1145/219717.219745}, doi = {10.1145/219717.219745}, number = {11}, journal = {Commun. {ACM}}, author = {Lenat, Douglas B.}, month = nov, year = {1995}, pages = {33{\textendash}38} }, @article{al_dallal_precise_2010, title = {A precise method-method interaction-based cohesion metric for object-oriented classes}, abstract = {The building of highly cohesive classes is an important objective in object-oriented design. Class cohesion refers to the relatedness of the class members, and it indicates one important aspect of the class design quality. 
A meaningful class cohesion metric helps object-oriented software developers detect class design weaknesses and refactor classes accordingly. Several class cohesion metrics have been proposed in the literature. Most of these metrics are applicable based on low-level design information such as attribute references in methods. Some of these metrics capture class cohesion by counting the number of method pairs sharing common attributes. A few metrics measure cohesion more precisely by considering the degree of interaction, through attribute references, between each pair of methods. However, the formulas applied by these metrics to measure the degree of interaction cause the metrics to violate important mathematical properties, thus undermining their construct validity and leading to misleading cohesion measurement. In this paper, we propose a formula that precisely measures the degree of interaction between each pair of methods, and we use it as a basis to introduce a low-level design class cohesion metric {(LSCC).} We verify that the proposed formula does not cause the metric to violate important mathematical properties. In addition, we provide a mechanism to use this metric as a useful indicator for refactoring weakly cohesive classes, thus showing its usefulness in improving class cohesion. Finally, we empirically validate {LSCC.} Using four open source software systems and eleven cohesion metrics, we investigate the relationship between {LSCC}, other cohesion metrics, and fault occurrences in classes. Our results show that {LSCC} is one of three metrics that explains more accurately the presence of faults in classes. {LSCC} is the only one among the three metrics to comply with important mathematical properties, and statistical analysis shows it captures a measurement dimension of its own. This suggests that {LSCC} is a better alternative, when taking into account both theoretical and empirical results, as a measure to guide the refactoring of classes. 
From a more general standpoint, the results suggest that class quality, as measured in terms of fault occurrences, can be more accurately explained by cohesion metrics that account for the degree of interaction between each pair of methods.}, journal = {{ACM} Transactions on Software Engineering and Methodology {(TOSEM)}}, author = {Al Dallal, J. and Briand, L.}, year = {2010} }, @inproceedings{balazinska_advanced_2000, address = {Washington, {DC}, {USA}}, title = {Advanced {Clone-Analysis} to Support {Object-Oriented} System Refactoring}, isbn = {0-7695-0881-2}, booktitle = {{WCRE} '00: Proceedings of the Seventh Working Conference on Reverse Engineering {(WCRE'00)}}, publisher = {{IEEE} Computer Society}, author = {Balazinska, Magdalena and Merlo, Ettore and Dagenais, Michel and Lag\"{u}e, Bruno and Kontogiannis, Kostas}, year = {2000}, keywords = {clones, refactoring}, pages = {98} }, @article{girvan_community_2002, title = {Community structure in social and biological networks.}, volume = {99}, issn = {0027-8424}, abstract = {A number of recent studies have focused on the statistical properties of networked systems such as social networks and the Worldwide Web. Researchers have concentrated particularly on a few properties that seem to be common to many networks: the small-world property, power-law degree distributions, and network transitivity. In this article, we highlight another property that is found in many networks, the property of community structure, in which network nodes are joined together in tightly knit groups, between which there are only looser connections. We propose a method for detecting such communities, built around the idea of using centrality indices to find community boundaries. We test our method on computer-generated and real-world graphs whose community structure is already known and find that the method detects this known structure with high sensitivity and reliability. 
We also apply the method to two networks whose community structure is not well known--a collaboration network and a food web--and find that it detects significant and informative community divisions in both cases.}, number = {12}, journal = {Proc Natl Acad Sci U S A}, author = {Girvan, Michelle and Newman, {ME}}, month = jun, year = {2002}, keywords = {betweenness, clustering, {SNA}}, annote = {http://dx.doi.org/10.1073/pnas.122653799}, annote = {References a Newman betweenness algorithm which has a later published erratum.} }, @article{al_dallal_software_2009, title = {Software similarity-based functional cohesion metric}, volume = {3}, url = {http://cat.inist.fr/?aModele=afficheN&cpsidt=21476664}, abstract = {Cohesion is an important factor used in evaluating software design quality and modularity. The cohesion of a module refers to the relatedness of the module components. In software engineering, highly cohesive modules are highly desirable because of their high reusability and maintainability. Cohesion is classified according to levels. Functional cohesion, the strongest level, refers to how closely the module parts that contribute to different outputs are related. Here, a similarity-based functional cohesion {(SBFC)} metric is introduced to measure the functional cohesion of a module in a procedural or object-oriented program. The metric uses the degree of similarity between the data slices of the module as a basis to measure functional cohesion. The appropriateness of the metric is evaluated both theoretically and empirically. The evaluation results show that the metric does as well as some earlier metrics in indicating the level of cohesiveness and it does better than some in terms of providing different values for the modules of different cohesion. 
In addition, the {SBFC} metric is used as an indicator for restructuring the weakly cohesive modules.}, number = {1}, journal = {Software, {IET}}, author = {Al Dallal, J.}, month = feb, year = {2009}, keywords = {cohesion, metrics, similarity, slicing}, pages = {45 -- 57} }, @article{briand_controlled_2001, title = {A controlled experiment for evaluating quality guidelines on the maintainability of object-oriented designs}, volume = {27}, issn = {0098-5589}, abstract = {The paper presents a controlled experiment, focusing on the impact of applying quality design principles such as the ones provided by P. Coad and E. Yourdon (1991) on the maintainability of object oriented designs. Results, which repeat the findings of a previous study, strongly suggest that such design principles have a beneficial effect on the maintainability of object oriented designs. It is argued that object oriented designs are sensitive to poor design practices because the cognitive complexity introduced becomes increasingly unmanageable. However, as our ability to generalize these results is limited, they should be considered as preliminary, i.e., it is very likely that they can only be generalized to programmers with little object oriented training and programming experience. Such programmers can, however, be commonly found on maintenance projects. As well as additional research, external replications of this study are required to confirm the results and achieve confidence in these findings}, number = {6}, journal = {{IEEE} Transactions on Software Engineering}, author = {Briand, {L.C.} and Bunse, C. 
and Daly, {J.W.}}, year = {2001}, keywords = {cognitive complexity, human factors, maintainability, maintenance, {OOP}}, pages = {513--530} }, @techreport{dongen_performance_2000, title = {Performance criteria for graph clustering and Markov cluster experiments}, url = {http://portal.acm.org/citation.cfm?id=868979}, abstract = {In{\textasciitilde}[1] a cluster algorithm for graphs was introduced called the Markov cluster algorithm or {MCL{\textasciitilde}algorithm.} The algorithm is based on simulation of (stochastic) flow in graphs by means of alternation of two operators, expansion and inflation. The results in{\textasciitilde}[2] establish an intrinsic relationship between the corresponding algebraic process {(MCL{\textasciitilde}process)} and cluster structure in the iterands and the limits of the process. Several kinds of experiments conducted with the {MCL{\textasciitilde}algorithm} are described here. Test cases with varying homogeneity characteristics are used to establish some of the particular strengths and weaknesses of the algorithm. In general the algorithm performs well, except for graphs which are very homogeneous (such as weakly connected grids) and for which the natural cluster diameter (i.e. the diameter of a subgraph induced by a natural cluster) is large. This can be understood in terms of the flow characteristics of the {MCL{\textasciitilde}algorithm} and the heuristic on which the algorithm is grounded. A generic performance criterion for clusterings of weighted graphs is derived, by a stepwise refinement of a simple and appealing criterion for simple graphs. The most refined criterion uses a particular Schur convex function, several properties of which are established. A metric is defined on the space of partitions, which is useful for comparing different clusterings of the same graph. The metric is compared with the metric known as the equivalence mismatch coefficient. 
The performance criterion and the metric are used for the quantitative measurement of experiments conducted with the {MCL{\textasciitilde}algorithm} on randomly generated test graphs with 10000 nodes. Scaling the {MCL{\textasciitilde}algorithm} requires a regime of pruning the stochastic matrices which need to be computed. The effect of pruning on the quality of the retrieved clusterings is also investigated. [1] A cluster algorithm for graphs. Technical report {INS-R0010}, National Research Institute for Mathematics and Computer Science in the Netherlands, Amsterdam, 2000. [2] A stochastic uncoupling process for graphs. Technical report {INS-R0011}, National Research Institute for Mathematics and Computer Science in the Netherlands, Amsterdam, 2000.}, institution = {{CWI} {(Centre} for Mathematics and Computer Science)}, author = {Dongen, Stijn}, year = {2000}, keywords = {clustering, graphs} }, @inproceedings{tsantalis_identification_2009, title = {Identification of Extract Method Refactoring Opportunities}, url = {http://www2.computer.org/portal/web/csdl/doi/10.1109/CSMR.2009.23}, abstract = {Extract Method has been recognized as one of the most important refactorings, since it decomposes large methods and can be used in combination with other refactorings for fixing a variety of design problems. However, existing tools and methodologies support extraction of methods based on a set of statements selected by the user in the original method. The goal of the proposed methodology is to automatically identify Extract Method refactoring opportunities and present them as suggestions to the designer of an object-oriented system. The suggested refactorings adhere to three principles: the extracted code should contain the complete computation of a given variable declared in the original method, the behavior of the program should be preserved after the application of the refactoring, and the extracted code should not be excessively duplicated in the original method. 
The proposed approach is based on the union of static slices that result from the application of a block-based slicing technique. The soundness of the identified refactoring opportunities has been evaluated by an independent designer on the system that he developed.}, author = {Tsantalis, Nikolaos and Chatzigeorgiou, Alexander}, month = mar, year = {2009}, keywords = {refactoring, smells} }, @inproceedings{de_volder_jquery:_2006, address = {Charleston, South Carolina}, title = {{JQuery:} A Generic Code Browser with a Declarative Configuration Language}, abstract = {Modern {IDEs} have an open-ended plugin architecture to allow customizability. However, developing a plugin is costly in terms of effort and expertise required by the customizer. We present a two-pronged approach that allows for open-ended customizations while keeping the customization cost low. First, we explicitly limit the portion of the design space targeted by the configuration mechanism. This reduces customization cost by simplifying the configuration interface. Second, we use a declarative programming language as our configuration language. This facilitates open-ended specification of behavior without burdening the user with operational details.}, booktitle = {Proceedings of {PADL} 2006}, author = {De Volder, Kris}, month = jan, year = {2006}, keywords = {query language} }, @inproceedings{gorschek_large-scale_2010, address = {Cape Town, South Africa}, title = {A large-scale empirical study of practitioners' use of object-oriented concepts}, volume = {1}, isbn = {978-1-60558-719-6}, url = {http://portal.acm.org/citation.cfm?id=1806799.1806820}, doi = {10.1145/1806799.1806820}, abstract = {We present the first results from a survey carried out over the second quarter of 2009 examining how theories in object-oriented design are understood and used by software developers. We collected 3785 responses from software developers world-wide, which we believe is the largest survey of its kind. 
We targeted the use of encapsulation, class size as measured by number of methods, and depth of a class in the inheritance hierarchy. We found that, while overall practitioners followed advice on encapsulation, there was some variation of adherence to it. For class size and depth there was substantially less agreement with expert advice. In addition, inconsistencies were found within the use and perception of object-oriented concepts within the investigated group of developers. The results of this survey has deep reaching consequences for both practitioners and researchers as they highlight and confirm central issues.}, booktitle = {Proceedings of the 32nd {ACM/IEEE} International Conference on Software Engineering}, publisher = {{ACM}}, author = {Gorschek, Tony and Tempero, Ewan and Angelis, Lefteris}, year = {2010}, keywords = {empirical, inheritance, metrics, number of methods, survey}, pages = {115--124} }, @misc{walton_eclipse_????, title = {Eclipse Metrics Plugin}, location = {http://eclipse-metrics.sourceforge.net/}, url = {http://eclipse-metrics.sourceforge.net/}, abstract = {A plug in for Eclipse that calculates metrics for your code during build cycles and warns you, via the Problems View, of 'range violations' for each metric. You may also export the metrics to various formats for further analysis.}, publisher = {State of Flow}, author = {Walton, Lance and Walton, Channing}, keywords = {Eclipse, metrics} }, @article{wilde_maintaining_1993, title = {Maintaining object-oriented software}, volume = {10}, issn = {0740-7459}, abstract = {The maintenance requirements of object-oriented software, including the ability to make changes easily and an in-depth understanding of the software's structure and behavior, are discussed. The problems encountered by a maintainer trying to understand object-oriented software by reading and statically analyzing it are described. 
The problems caused by dynamic binding, polymorphism, and cooperating object classes in object-oriented software maintenance are reviewed}, number = {1}, journal = {Software, {IEEE}}, author = {Wilde, N. and Matthews, P. and Huitt, R.}, year = {1993}, keywords = {maintenance, {OOP}}, pages = {75--80} }, @inproceedings{daniel_automated_2007, address = {Dubrovnik, Croatia}, title = {Automated testing of refactoring engines}, isbn = {978-1-59593-811-4}, url = {http://portal.acm.org/citation.cfm?id=1287624.1287651}, doi = {10.1145/1287624.1287651}, abstract = {Refactorings are behavior-preserving program transformations that improve the design of a program. Refactoring engines are tools that automate the application of refactorings: first the user chooses a refactoring to apply, then the engine checks if the transformation is safe, and if so, transforms the program. Refactoring engines are a key component of modern {IDEs}, and programmers rely on them to perform refactorings. A bug in the refactoring engine can have severe consequences as it can erroneously change large bodies of source code.}, booktitle = {Proceedings of the the 6th joint meeting of the European software engineering conference and the {ACM} {SIGSOFT} symposium on The foundations of software engineering}, publisher = {{ACM}}, author = {Daniel, Brett and Dig, Danny and Garcia, Kely and Marinov, Darko}, year = {2007}, keywords = {automated testing, refactoring engines, test data generation}, pages = {185--194} }, @article{jung_measuring_2005, title = {Measuring software product quality: a survey of {ISO/IEC} 9126}, volume = {21}, issn = {0740-7459}, number = {5}, journal = {Software, {IEEE}}, author = {Jung, H. W and Kim, S. G and Chung, C. 
S}, year = {2005}, pages = {88{\textendash}92} }, @inproceedings{zhou_icbmc:_2002, title = {{ICBMC:} An Improved Cohesion Measure for Classes}, isbn = {0-7695-1819-2}, shorttitle = {{ICBMC}}, url = {http://portal.acm.org/citation.cfm?id=879764}, doi = {10.1109/ICSM.2002.1167746}, abstract = {Class cohesion could be used to evaluate the design quality of classes, to develop test measures for object-oriented software and to restructure poorly designed classes. Among a number of class cohesion measures proposed in the last decade, H. S. Chae's measure is based on the structure of the reference graph of a class, which overcomes the limitations of most class cohesion measures. However, it only considers the patterns of interactions among the members of a class partly and hence does not satisfy monotonicity, which might cause the measuring results inconsistent with intuition in some cases. This paper first analyzes the limitations of typical cohesion measures for classes in detail, and then proposes an improved cohesion measure {ICBMC.} Finally, this paper exemplifies the advantages and applications of {ICBMC.}}, booktitle = {Proceedings of the International Conference on Software Maintenance {(ICSM'02)}}, publisher = {{IEEE} Computer Society}, author = {Zhou, Yuming and Xu, Baowen and Zhao, Jianjun and Yang, Hongji}, year = {2002}, keywords = {cohesion, graphs, metrics, metrics validation, refactoring}, pages = {44--53}, annote = {{ICMBC} is an improvement on the graph-based {CMBC} cohesion metric. This paper also discusses how this metric can be used for restructuring classes. Uses the term "reference graph" for graphs showing the interdependencies of methods and attributes. Discusses drawbacks of some other metrics. 
Relevance: 5} }, @book{lakos_large-scale_1996, address = {Reading, Mass}, series = {{Addison-Wesley} professional computing series}, title = {{Large-Scale} C++ Software Design}, isbn = {0201633620}, lccn = {{QA76.73.C153} L342 1996}, publisher = {{Addison-Wesley} Pub. Co}, author = {Lakos, John}, year = {1996}, keywords = {metrics, {OOD}, {OOP}}, annote = {Includes the concept of levels - the length of the longest path from that component through the local component dependency graph to the possibly empty set of external or compiler-supplied components. Lakos is against cyclic dependencies. He also defines the Cumulative Component Dependency metric, {CCD}, which is the sum over all components Ci in a subsystem of the number of components needed in order to test each Ci incrementally. Relevance: 4} }, @article{basit_data_2009, title = {A Data Mining Approach for Detecting {Higher-Level} Clones in Software}, volume = {35}, issn = {0098-5589}, doi = {http://doi.ieeecomputersociety.org/10.1109/TSE.2009.16}, abstract = {Code clones are similar program structures recurring in variant forms in software system(s). Several techniques have been proposed to detect similar code fragments in software, so-called simple clones. Identification and subsequent unification of simple clones is beneficial in software maintenance. Even further gains can be obtained by elevating the level of code clone analysis. We observed that recurring patterns of simple clones often indicate the presence of interesting higher-level similarities that we call structural clones. Structural clones show a bigger picture of similarity situation than simple clones alone. Being logical groups of simple clones, structural clones alleviate the problem of huge number of clones typically reported by simple clone detection tools, a problem that is often dealt with postdetection visualization techniques. 
Detection of structural clones can help in understanding the design of the system for better maintenance and in reengineering for reuse, among other uses. In this paper, we propose a technique to detect some useful types of structural clones. The novelty of our approach includes the formulation of the structural clone concept and the application of data mining techniques to detect these higher-level similarities. We describe a tool called Clone Miner that implements our proposed technique. We assess the usefulness and scalability of the proposed techniques via several case studies. We discuss various usage scenarios to demonstrate in what ways the knowledge of structural clones adds value to the analysis based on simple clones alone.}, number = {4}, journal = {{IEEE} Transactions on Software Engineering}, author = {Basit, Hamid Abdul and Jarzabek, Stan}, year = {2009}, keywords = {design concepts, maintainability, reengineering, restructuring, reverse engineering}, pages = {497--514}, annote = {Complete {PDF} document was either not available or accessible. Please make sure you're logged in to the digital library to retrieve the complete {PDF} document.} }, @article{mens_survey_2004, title = {A survey of software refactoring}, volume = {30}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1265817}, abstract = {We provide an extensive overview of existing research in the field of software refactoring. This research is compared and discussed based on a number of different criteria: the refactoring activities that are supported, the specific techniques and formalisms that are used for supporting these activities, the types of software artifacts that are being refactored, the important issues that need to be taken into account when building refactoring tool support, and the effect of refactoring on the software process. 
A running example is used to explain and illustrate the main concepts.}, number = {2}, journal = {{IEEE} Transactions on Software Engineering}, author = {Mens, T and Tourwe, T}, year = {2004}, keywords = {refactoring, survey}, pages = {139, 126} }, @phdthesis{fokaefs_identification_2010, address = {Edmonton, Alberta}, type = {Master's thesis}, title = {Identification and Application of Extract Class Refactorings in {Object-Oriented} Systems}, abstract = {Software can be considered a live entity, as it undergoes many alterations throughout its lifecycle. Therefore, code can become rather complex and difficult to understand. More specifically in object-oriented systems, classes may become very large and less cohesive. In order to identify such problematic cases, existing approaches have proposed the use of cohesion metrics. While metrics can identify classes with low cohesion, they usually cannot identify new or independent concepts. In this work, we propose a class decomposition method using an clustering algorithm based on the Jaccard distance between class members. The methodology is able to identify new concepts and rank the solutions according to their impact on the design quality of the system. The methodology was evaluated in terms of assessment by designers, expert assessment and metrics. The evaluation showed the ability of the method to identify new recognizable concepts and improve the design quality of the underlying system.}, school = {Dept. of Computing Science, University of Alberta}, author = {Fokaefs, Marios}, year = {2010}, keywords = {clustering, extract class, refactoring, software clustering}, annote = {Simple agglomerative clustering using a Jaccard distance. It's unclear exactly what is going into the property sets. After clustering, suggestions are ranked using the entity placement metric. 
Relevance: 5} }, @inproceedings{joshi_concept_2009, address = {Los Alamitos, {CA}, {USA}}, title = {Concept Analysis for Class Cohesion}, doi = {http://doi.ieeecomputersociety.org/10.1109/CSMR.2009.54}, abstract = {A concept lattice based approach for analysis of class cohesion is presented. The approach facilitates rapid identification of less cohesive classes. It also helps identify less cohesive methods, attributes and classes in one go. Further, the approach guides refactorings such as extract class, move method, localize attributes and remove unused {attributes.The} effectiveness of the technique is demonstrated through examples.}, booktitle = {Software Maintenance and Reengineering, European Conference on}, publisher = {{IEEE} Computer Society}, author = {Joshi, Padmaja and Joshi, Rushikesh K.}, year = {2009}, keywords = {cohesion analysis, concept analysis, refactoring}, pages = {237--240}, annote = {Complete {PDF} document was either not available or accessible. Please make sure you're logged in to the digital library to retrieve the complete {PDF} document.} }, @article{wu_finding_2004, title = {Finding communities in linear time: a physics approach}, volume = {38}, number = {2}, journal = {The European Physical Journal {B-Condensed} Matter and Complex Systems}, author = {Wu, F. and Huberman, B. A.}, year = {2004}, keywords = {clustering, graph algorithms}, pages = {331{\textendash}338} }, @article{concas_empirical_2010, title = {An empirical study of social networks metrics in object-oriented software}, volume = {2010}, issn = {1687-8655}, url = {http://dx.doi.org/10.1155/2010/729826}, doi = {http://dx.doi.org/10.1155/2010/729826}, abstract = {We study the application to object-oriented software of new metrics, derived from Social Network Analysis. Social Networks metrics, as for instance, the {EGO} metrics, allow to identify the role of each single node in the information flow through the network, being related to software modules and their dependencies. 
These metrics are compared with other traditional software metrics, like the {Chidamber-Kemerer} suite, and software graph metrics. We examine the empirical distributions of all the metrics, bugs included, across the software modules of several releases of two large Java systems, Eclipse and Netbeans. We provide analytical distribution functions suitable for describing and studying the observed distributions. We study also correlations among metrics and bugs. We found that the empirical distributions systematically show fat-tails for all the metrics. Moreover, the various metric distributions look very similar and consistent across all system releases and are also very similar in both the studied systems. These features appear to be typical properties of these software metrics.},
  journal = {Advances in Software Engineering},
  author = {Concas, Giulio and Marchesi, Michele and Murgia, Alessandro and Tonelli, Roberto},
  month = jan,
  year = {2010},
  note = {{ACM} {ID:} 1972593},
  pages = {4:1--4:21}
},
@inproceedings{elish_investigating_2009,
  address = {Los Alamitos, {CA}, {USA}},
  title = {Investigating the Effect of Refactoring on Software Testing Effort},
  doi = {10.1109/APSEC.2009.14},
  abstract = {Refactoring, the process of improving the design of existing code by changing its internal structure without affecting its external behavior, tends to improve software quality by improving design, improving readability, and reducing bugs. There are many different refactoring methods, each having a particular purpose and effect. Consequently, the effect of refactorings on software quality attribute may vary. Software testing is an external software quality attributes that takes lots of time and effort to make sure that the software performs as intended. In this paper, we propose a classification of refactoring methods based on their measurable effect on software testing effort.
This, in turn, helps the software developers decide which refactoring methods to apply in order to optimize a software system with regard to the testing effort.},
  booktitle = {{Asia-Pacific} Software Engineering Conference},
  publisher = {{IEEE} Computer Society},
  author = {Elish, Karim O. and Alshayeb, Mohammad},
  year = {2009},
  keywords = {refactoring, testing effort},
  pages = {29--34},
  annote = {``Extract Class'' reduces the testing effort}
},
@inproceedings{schmolze_classification_1983,
  address = {Karlsruhe, West Germany},
  title = {Classification in the {KL-ONE} Knowledge Representation System},
  booktitle = {Proceedings Int. Joint Conf. on Artificial Intelligence ({IJCAI}'83)},
  month = aug,
  author = {Schmolze, J. G. and Lipkis, T. A.},
  year = {1983},
  keywords = {kbs, ontologies, pattern matching, semantic networks},
  pages = {330--332},
  annote = {One of the earliest papers about automatic classification of concepts in a semantic network. Relevance: 3}
},
@article{hartigan_algorithm_1979,
  title = {Algorithm {AS} 136: A {K-Means} Clustering Algorithm},
  volume = {28},
  issn = {0035-9254},
  shorttitle = {Algorithm {AS} 136},
  url = {http://www.jstor.org/stable/2346830},
  doi = {10.2307/2346830},
  number = {1},
  journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
  author = {Hartigan, J. A. and Wong, M.
A.},
  month = jan,
  year = {1979},
  pages = {100--108}
},
@article{kaur_exploring_2010,
  title = {Exploring Design Level Class Cohesion Metrics},
  volume = {3},
  issn = {1945-3116},
  url = {http://www.thefreelibrary.com/Exploring+design+level+class+cohesion+metrics.-a0227197051},
  doi = {10.4236/jsea.2010.34043},
  number = {4},
  journal = {Journal of Software Engineering and Applications},
  author = {Kaur, Kuljit},
  year = {2010},
  keywords = {cohesion, metrics, {OOD}, survey},
  pages = {384--390}
},
@incollection{brandes_graphml_2005-1,
  title = {{GraphML} Transformation},
  url = {http://www.springerlink.com/content/e8hp0qpf2yaq8q63},
  abstract = {The efforts put into {XML-related} technologies have exciting consequences for {XML-based} graph data formats such as {GraphML}. We here give a systematic overview of the possibilities offered by {XSLT} style sheets for processing graph data, and illustrate that many basic tasks required for tools used in graph drawing can be implemented by means of style sheets, which are convenient to use, portable, and easy to customize.},
  booktitle = {Graph Drawing},
  author = {Brandes, Ulrik and Pich, Christian},
  year = {2005},
  keywords = {graphs},
  pages = {89--99}
},
@inproceedings{sensalire_classifying_2008,
  address = {Ammersee, Germany},
  title = {Classifying desirable features of software visualization tools for corrective maintenance},
  isbn = {978-1-60558-112-5},
  url = {http://portal.acm.org/citation.cfm?id=1409720.1409734},
  doi = {10.1145/1409720.1409734},
  abstract = {We provide an evaluation of 15 software visualization tools applicable to corrective maintenance. The tasks supported as well as the techniques used are presented and graded based on the support level. By analyzing user acceptation of current tools, we aim to help developers to select what to consider, avoid or improve in their next releases.
Tool users can also recognize what to broadly expect (and what not) from such tools, thereby supporting an informed choice for the tools evaluated here and for similar tools.},
  booktitle = {Proceedings of the 4th {ACM} symposium on Software visualization},
  publisher = {{ACM}},
  author = {Sensalire, Mariam and Ogao, Patrick and Telea, Alexandru},
  year = {2008},
  keywords = {visualization},
  pages = {87--90}
},
@inproceedings{ducasse_language_1999,
  title = {A Language Independent Approach for Detecting Duplicated Code},
  isbn = {0-7695-0016-1},
  url = {http://portal.acm.org/citation.cfm?id=853389},
  abstract = {Code duplication is one of the factors that severely complicates the maintenance and evolution of large software systems. Techniques for detecting duplicated code exist but rely mostly on parsers, technology that has proven to be brittle in the face of different languages and dialects. In this paper we show that is possible to circumvent this hindrance by applying a language independent and visual approach, i.e. a tool that requires no parsing, yet is able to detect a significant amount of code duplication. We validate our approach on a number of case studies, involving four different implementation languages and ranging from 256 K up to {13Mb} of source code size.},
  booktitle = {Proceedings of the {IEEE} International Conference on Software Maintenance},
  publisher = {{IEEE} Computer Society},
  author = {Ducasse, St{\'e}phane and Rieger, Matthias and Demeyer, Serge},
  year = {1999},
  keywords = {clones, maintenance, visualization},
  pages = {109}
},
@inproceedings{munro_product_2005,
  title = {Product metrics for automatic identification of ``bad smell'' design problems in {Java} source-code},
  issn = {1530-1435},
  abstract = {Refactoring can have a direct influence on reducing the cost of software maintenance through changing the internal structure of the source-code to improve the overall design that helps the present and future programmers evolve and understand a system.
Bad smells are a set of design problems with refactoring identified as a solution. Locating these bad smells has been described as more a human intuition than an exact science. This paper addresses the issue of identifying the characteristics of a bad smell through the use of a set of software metrics. Then by using a pre-defined set of interpretation rules to interpret the software metric results applied to Java source-code, the software engineer can be provided with significant guidance as to the location of bad smells. These issues are addressed in a number of ways. Firstly, a precise definition of bad smells is given from the informal descriptions given by the originators Fowler and Beck. The characteristics of the bad smells have been used to define a set of measurements and interpretation rules for a subset of the bad smells. A prototype tool has been implemented to enable the evaluation of the interpretation rules in two case studies.},
  booktitle = {Proceedings of the 11th {IEEE} International Symposium on Software Metrics},
  author = {Munro, M. J.},
  year = {2005},
  keywords = {metrics, refactoring, size, smells},
  pages = {15},
  annote = {A fairly lightweight paper about automatically detecting lazy classes and temporary fields. Relevance: 3}
},
@inproceedings{tzerpos_mojo:_1999,
  title = {{MoJo}: a distance metric for software clusterings},
  shorttitle = {{MoJo}},
  abstract = {The software clustering problem has attracted much attention recently, since it is an integral part of the process of reverse engineering large software systems. A key problem in this research is the difficulty in comparing different approaches in an objective fashion. In this paper, we present a metric, called {MoJo} (Move-Join), that can be used in evaluating the similarity of two different decompositions of a software system. Our metric calculates a distance between two partitions of the same set of software resources. We begin by introducing the model we use.
Then we present a heuristic algorithm that calculates the distance in an efficient fashion. Finally, we discuss some experiments that showcase the performance of the algorithm and the effectiveness of the metric},
  booktitle = {Proceedings Sixth Working Conference on Reverse Engineering, 1999},
  author = {Tzerpos, V. and Holt, R. C.},
  year = {1999},
  keywords = {cluster evaluation, clustering, distance metric, heuristic programming, metrics, reverse engineering, software clustering},
  pages = {187--193}
},
@article{brin_anatomy_1998,
  title = {The anatomy of a large-scale hypertextual {Web} search engine},
  volume = {30},
  number = {1--7},
  journal = {Computer networks and {ISDN} systems},
  author = {Brin, S. and Page, L.},
  year = {1998},
  keywords = {search, world wide web},
  pages = {107--117}
},
@inproceedings{hitz_measuring_1995,
  address = {Monterrey, Mexico},
  title = {Measuring coupling and cohesion in object-oriented systems},
  abstract = {As the role that software metrics in general and coupling in particular play with respect to maintainability of software products is widely accepted, current approaches to handle coupling and / or cohesion in object-oriented systems are evaluated. Having identified some inadequacies, we provide a comprehensive framework to deal with all sorts of coupling. This framework takes into account the distinction between object level - and class level coupling. This distinction refers to dynamic dependencies between objects on one hand and static dependencies between implementations on the other hand, representing important aspects of software quality at run-time and during the maintenance phase, respectively. As far as cohesion is concerned, we analyze a well known metric put forward by Chidamber and Kemerer and re-stated by Li and Henry. As a result, we present a graph theoretic improved version of this metric.},
  booktitle = {Proc. Int.
Symposium on Applied Corporate Computing},
  author = {Hitz, Martin and Montazeri, Behzad},
  year = {1995},
  keywords = {cohesion, coupling, metrics},
  annote = {Graph based cohesion. Mostly about coupling. Improves on previous cohesion metrics by considering indirect accesses of attributes and accessor methods. Relevance: 4}
},
@manual{batagelj_pajek_2008,
  title = {{Pajek} -- Program for Analysis and Visualization of Large Networks Reference Manual},
  url = {http://vlado.fmf.uni-lj.si/Pub/Networks/Pajek/Doc/PajekMan.pdf},
  author = {Batagelj, Vladimir and Mrvar, Andrej},
  month = dec,
  year = {2008},
  keywords = {graphs, visualization}
},
@misc{sauer_eclipse_2010,
  title = {Eclipse Metrics plugin continued},
  url = {http://metrics2.sourceforge.net/},
  author = {Sauer, Frank and Boissier, Guillaume},
  year = {2010},
  note = {Accessed 2010-04-22},
  keywords = {Eclipse, metrics},
  howpublished = {http://metrics2.sourceforge.net/}
},
@inproceedings{malayeri_is_2009,
  address = {Berlin, Heidelberg},
  series = {{ESOP} '09},
  title = {Is Structural Subtyping Useful? An Empirical Study},
  isbn = {978-3-642-00589-3},
  location = {York, {UK}},
  shorttitle = {Is Structural Subtyping Useful?},
  url = {http://dx.doi.org/10.1007/978-3-642-00590-9_8},
  doi = {10.1007/978-3-642-00590-9_8},
  abstract = {Structural subtyping is popular in research languages, but all mainstream object-oriented languages use nominal subtyping. Since languages with structural subtyping are not in widespread use, the empirical questions of whether and how structural subtyping is useful have thus far remained unanswered. This study aims to provide answers to these questions. We identified several criteria that are indicators that nominally typed programs could benefit from structural subtyping, and performed automated and manual analyses of open-source Java programs based on these criteria. Our results suggest that these programs could indeed be improved with the addition of structural subtyping.
We hope this study will provide guidance for language designers who are considering use of this subtyping discipline.},
  booktitle = {Proceedings of the 18th European Symposium on Programming Languages and Systems: Held as Part of the Joint European Conferences on Theory and Practice of Software, {ETAPS} 2009},
  publisher = {{Springer-Verlag}},
  author = {Malayeri, Donna and Aldrich, Jonathan},
  year = {2009},
  note = {{ACM} {ID:} 1532986},
  pages = {95--111}
},
@inproceedings{cassell_visualizing_2011,
  address = {Perth, Australia},
  series = {{CRPIT}},
  title = {Visualizing the Refactoring of Classes via Clustering},
  volume = {113},
  abstract = {When developing object-oriented classes, it is difficult to determine how to best reallocate the members of large, complex classes to create smaller, more cohesive ones. Clustering techniques can provide guidance on how to solve this allocation problem; however, inappropriate use of clustering can result in a class structure that is less maintainable than the original. The {ExtC} Visualizer helps the programmer understand the class structure by visually emphasizing important features of the class's members and their inter-relationships. More importantly, it helps users see how various clustering algorithms group the class's members. These insights help a programmer choose appropriate techniques for refactoring large classes.},
  booktitle = {Proc. Australasian Computer Science Conference ({ACSC} 2011)},
  publisher = {{ACS}},
  author = {Cassell, Keith and Anslow, Craig and Groves, Lindsay and Andreae, Peter},
  month = jan,
  year = {2011},
  keywords = {clustering, extract class, refactoring, software clustering, visualization},
  pages = {63--72}
},
@article{kitchenham_towards_1995,
  title = {Towards a Framework for Software Measurement Validation},
  volume = {21},
  url = {http://portal.acm.org/citation.cfm?id=631201},
  abstract = {In this paper we propose a framework for validating software measurement.
We start by defining a measurement structure model that identifies the elementary component of measures and the measurement process, and then consider five other models involved in measurement: unit definition models, instrumentation models, attribute relationship models, measurement protocols and entity population models. We consider a number of measures from the viewpoint of our measurement validation framework and identify a number of shortcomings; in particular we identify a number of problems with the construction of function points. We also compare our view of measurement validation with ideas presented by other researchers and identify a number of areas of disagreement. Finally, we suggest several rules that practitioners and researchers can use to avoid measurement problems, including the use of measurement vectors rather than artificially contrived scalars.},
  number = {12},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Kitchenham, Barbara and Pfleeger, Shari Lawrence and Fenton, Norman},
  year = {1995},
  keywords = {measurement theory, metrics, metrics validation},
  pages = {929--944}
},
@inproceedings{maekelae_observations_2006,
  address = {University of Veliko Tarnovo, Bulgaria},
  title = {Observations on Lack of Cohesion Metrics},
  abstract = {Lack of Cohesion Metric ({LCOM}) is perhaps the most used metric when trying to measure the goodness of a class written in some object-oriented language. We apply the basic {LCOM} metric to the Java {SDK} 5.0 and Eclipse libraries. {LCOM} gives a bad value for most of the classes. We study the reasons and characterize sets of classes for which one should not apply {LCOM} to determine the goodness of an implementation. Yet, one of the major reasons for bad {LCOM} value is observed to be its dependence on the measured class size.
Normalization in this respect is considered},
  booktitle = {International Conference on Computer Systems and Technologies},
  author = {M{\"a}kel{\"a}, Sami and Lepp{\"a}nen, Ville},
  month = jun,
  year = {2006},
  keywords = {cohesion, empirical, metrics},
  annote = {Analyses where {LCOM} falls short, including discussion of reflection, inner classes.}
},
@article{subramanyam_empirical_2003,
  title = {Empirical analysis of {CK} metrics for object-oriented design complexity: implications for software defects},
  volume = {29},
  issn = {0098-5589},
  shorttitle = {Empirical analysis of {CK} metrics for object-oriented design complexity},
  doi = {10.1109/TSE.2003.1191795},
  abstract = {To produce high quality object-oriented ({OO}) applications, a strong emphasis on design aspects, especially during the early phases of software development, is necessary. Design metrics play an important role in helping developers understand design aspects of software and, hence, improve software quality and developer productivity. In this paper, we provide empirical evidence supporting the role of {OO} design complexity metrics, specifically a subset of the Chidamber and Kemerer (1991, 1994) suite ({CK} metrics), in determining software defects. Our results, based on industry data from software developed in two popular programming languages used in {OO} development, indicate that, even after controlling for the size of the software, these metrics are significantly associated with defects. In addition, we find that the effects of these metrics on defects vary across the samples from two programming languages---C++ and Java. We believe that these results have significant implications for designing high-quality software products using the {OO} approach.},
  number = {4},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Subramanyam, R.
and Krishnan, M. S.},
  year = {2003},
  keywords = {empirical, metrics, {OOD}, {OOP}},
  pages = {297--310},
  annote = {Has a nice table summarizing other empirical research on {OO} metrics.}
},
@inproceedings{koschke_framework_2000,
  title = {A framework for experimental evaluation of clustering techniques},
  isbn = {0769506569},
  abstract = {Experimental evaluation of clustering techniques for component recovery is necessary in order to analyze their strengths and weaknesses in comparison to other techniques. For comparable evaluations of automatic clustering techniques, a common reference corpus of freely available systems is needed for which the actual components are known. The reference corpus is used to measure recall and precision of automatic techniques. For this measurement, a standard scheme for comparing the components recovered by a clustering technique to components in the reference corpus is required. This paper describes both the process of setting up reference corpora and ways of measuring recall and precision of automatic clustering techniques. For methods with human intervention, controlled experiments should be conducted. This paper additionally proposes a controlled experiment as a standard for evaluating manual and semi-automatic component recovery methods that can be conducted cost-effectively},
  booktitle = {Proceedings 8th International Workshop on Program Comprehension},
  author = {Koschke, R. and Eisenbarth, T.},
  year = {2000},
  keywords = {cluster evaluation, clustering, software clustering, testing},
  pages = {201--210}
},
@inproceedings{stroggylos_refactoring--does_2007,
  title = {{Refactoring--Does} It Improve Software Quality?},
  url = {http://dx.doi.org/10.1109/WOSQ.2007.11},
  doi = {10.1109/WOSQ.2007.11},
  abstract = {Software systems undergo modifications, improvements and enhancements to cope with evolving requirements. This maintenance can cause their quality to decrease. Various metrics can be used to evaluate the way the quality is affected.
Refactoring is one of the most important and commonly used techniques of transforming a piece of software in order to improve its quality. However, although it would be expected that the increase in quality achieved via refactoring is reflected in the various metrics, measurements on real life systems indicate the opposite. We analyzed source code version control system logs of popular open source software systems to detect changes marked as refactorings and examine how the software metrics are affected by this process, in order to evaluate whether refactoring is effectively used as a means to improve software quality within the open source community.},
  booktitle = {Software Quality, 2007. {WoSQ'07:} {ICSE} Workshops 2007. Fifth International Workshop on},
  author = {Stroggylos, K. and Spinellis, D.},
  year = {2007},
  keywords = {refactoring},
  pages = {10}
},
@inproceedings{van_kempen_towards_2005,
  address = {Republic of South Africa},
  series = {{SAICSIT} '05},
  title = {Towards proving preservation of behaviour of refactoring of {UML} models},
  isbn = {1-59593-258-5},
  location = {White River, South Africa},
  url = {http://dl.acm.org/citation.cfm?id=1145675.1145703},
  booktitle = {Proceedings of the 2005 annual research conference of the South African institute of computer scientists and information technologists on {IT} research in developing countries},
  publisher = {South African Institute for Computer Scientists and Information Technologists},
  author = {van Kempen, Marc and Chaudron, Michel and Kourie, Derrick and Boake, Andrew},
  year = {2005},
  keywords = {refactoring, restructuring, statechart, {UML}},
  pages = {252--259}
},
@article{maqbool_hierarchical_2007,
  title = {Hierarchical Clustering for Software Architecture Recovery},
  volume = {33},
  url = {http://portal.acm.org/citation.cfm?id=1314083},
  abstract = {Gaining an architectural level understanding of a software system is important for many reasons.
When the description of a system's architecture does not exist, attempts must be made to recover it. In recent years, researchers have explored the use of clustering for recovering a software system's architecture, given only its source code. The main contributions of this paper are as follows. First, we review hierarchical clustering research in the context of software architecture recovery and modularization. Second, to employ clustering meaningfully, it is necessary to understand the peculiarities of the software domain, and the behavior of clustering measures and algorithms in this domain. To this end, we provide a detailed analysis of the behavior of various similarity and distance measures that may be employed for software clustering. Thirdly, we analyze the clustering process of various well-known clustering algorithms using multiple criteria, and show how arbitrary decisions taken by these algorithms during clustering affect the quality of their results. Finally, we present an analysis of two recently proposed clustering algorithms, revealing close similarities in their apparently different clustering approaches. Experiments on four legacy software systems provide insight into the behavior of well-known clustering algorithms, and their characteristics in the software domain.},
  number = {11},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Maqbool, Onaiza and Babri, Haroon},
  year = {2007},
  keywords = {architecture recovery, clustering, hierarchical clustering, restructuring, reverse engineering, software clustering},
  pages = {759--780}
},
@inproceedings{noack_space_2005,
  address = {St. Louis, Missouri},
  title = {A space of layout styles for hierarchical graph models of software systems},
  url = {http://portal.acm.org/beta/citation.cfm?id=1056018.1056040},
  doi = {10.1145/1056018.1056040},
  abstract = {Hierarchical graphs are widely used as models of the structure of software systems.
A central problem in the visualization of hierarchical graphs is the computation of layouts, i.e. of positions of the nodes in two- or three-dimensional space. We derive requirements for graph layouts from various software analysis questions, and classify the required layouts along three dimensions: layouts with meaningful distances between single nodes vs. layouts with meaningful distances between groups of nodes, layouts reflecting adjacency vs. layouts reflecting hierarchy, and layouts that faithfully reflect the size of subgraphs vs. layouts where certain subgraphs are magnified. We present a fairly simple and theoretically validated energy model for computing such layouts.},
  booktitle = {Proceedings of the 2005 {ACM} symposium on Software visualization - {SoftVis} '05},
  author = {Noack, Andreas and Lewerentz, Claus},
  year = {2005},
  keywords = {graph algorithms, graph layout, graphs},
  pages = {155}
},
@misc{cassell_refactoring_2010,
  title = {refactoring : Message: Suites for testing automated refactoring tools},
  url = {http://tech.groups.yahoo.com/group/refactoring/message/10396},
  note = {refactoring - The Refactoring Group},
  author = {Cassell, Keith},
  month = may,
  year = {2010},
  howpublished = {http://tech.groups.yahoo.com/group/refactoring/message/10396}
},
@inproceedings{opdyke_refactoring:_1990,
  title = {Refactoring: An aid in designing application frameworks and evolving object-oriented systems},
  booktitle = {Proceedings of Symposium on {Object-Oriented} Programming Emphasizing Practical Applications},
  author = {Opdyke, William and Johnson, Ralph},
  year = {1990},
  keywords = {refactoring}
},
@article{tsantalis_identification_2009-1,
  title = {Identification of Move Method Refactoring Opportunities},
  volume = {35},
  url = {http://www2.computer.org/portal/web/csdl/doi/10.1109/TSE.2009.1},
  abstract = {Placement of attributes/methods within classes in an object-oriented system is usually guided by conceptual criteria and aided by appropriate metrics.
Moving state and behavior between classes can help reduce coupling and increase cohesion, but it is nontrivial to identify where such refactorings should be applied. In this paper, we propose a methodology for the identification of Move Method refactoring opportunities that constitute a way for solving many common Feature Envy bad smells. An algorithm that employs the notion of distance between system entities (attributes/methods) and classes extracts a list of behavior-preserving refactorings based on the examination of a set of preconditions. In practice, a software system may exhibit such problems in many different places. Therefore, our approach measures the effect of all refactoring suggestions based on a novel Entity Placement metric that quantifies how well entities have been placed in system classes. The proposed methodology can be regarded as a semi-automatic approach since the designer will eventually decide whether a suggested refactoring should be applied or not based on conceptual or other design quality criteria. The evaluation of the proposed approach has been performed considering qualitative, metric, conceptual, and efficiency aspects of the suggested refactorings in a number of open-source projects.},
  number = {3},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Tsantalis, Nikolaos and Chatzigeorgiou, Alexander},
  month = jan,
  year = {2009},
  keywords = {move method, refactoring, smells},
  pages = {347--367}
},
@inproceedings{marx_computer-aided_2010,
  title = {{Computer-Aided} Extraction of Software Components},
  issn = {1095-1350},
  doi = {10.1109/WCRE.2010.28},
  abstract = {In a software project, outsourcing the development of a particular functionality, reusing a part in another software, or handing-over a part of the code to a new team member requires the extraction of an independent subset of the software---a component. This paper describes and analyzes the process of extracting such a component.
We introduce an automated approach based on optimizing the cut between the new component and the remaining system. A visual development tool implements our approach and interactively supports the extraction. Finally, we look at the results of a thinking aloud user study and discuss the lessons learned about the extraction tool as well as the extraction process.},
  booktitle = {Reverse Engineering {(WCRE)}, 2010 17th Working Conference on},
  author = {Marx, Andreas and Beck, Fabian and Diehl, Stephan},
  year = {2010},
  keywords = {outsourcing, software development, software management, software project, software reusability},
  pages = {183--192}
},
@inproceedings{jemerov_implementing_2008,
  address = {Nashville, Tennessee, {USA}},
  title = {Implementing Refactorings in {IntelliJ} {IDEA}},
  abstract = {{IntelliJ} {IDEA} was one of the first Java {IDEs} to cross the Refactoring Rubicon [1], by implementing the Extract Method refactoring for Java in early 2001. Since that time, {IntelliJ} {IDEA} has evolved greatly to support a wide array of refactorings for Java, cross-language refactoring and other advanced features. This paper gives an overview of the key architectural components of {IntelliJ} {IDEA} involved in implementing refactorings. It also describes some of the problems we're facing when implementing refactorings and possible directions for future development.},
  booktitle = {Proceedings of the Second {ACM} Workshop on Refactoring Tools},
  author = {Jemerov, Dmitry},
  month = oct,
  year = {2008},
  keywords = {refactoring},
  pages = {13:1--13:2},
  annote = {A two page paper on some aspects of {IntelliJ's} refactoring capabilities. Includes the notion of a separate pattern matching specification language to help define custom refactorings.
Relevance: 3 } },
@book{kerievsky_refactoring_2005,
  title = {Refactoring to Patterns},
  isbn = {0321213351},
  publisher = {{Addison-Wesley}},
  author = {Kerievsky, Joshua},
  year = {2005},
  keywords = {refactoring},
  annote = {Discusses when code should be refactored to use a design pattern, including indicators in the original code. Relevance: 5. Pattern catalog is at http://www.industriallogic.com/xp/refactoring/catalog.html}
},
@incollection{koschke_identifying_2008,
  title = {Identifying and Removing Software Clones},
  url = {http://dx.doi.org/10.1007/978-3-540-76440-3_2},
  doi = {10.1007/978-3-540-76440-3_2},
  abstract = {Ad-hoc reuse through copy-and-paste occurs frequently in practice affecting the evolvability of software. Researchers have investigated ways to locate and remove duplicated code. Empirical studies have explored the root causes and effects of duplicated code and the evolution of duplicated code. This chapter summarizes the state of the art in detecting, managing, and removing software redundancy. It describes consequences, pros and cons of copying and pasting code.},
  booktitle = {Software Evolution},
  author = {Koschke, Rainer},
  year = {2008},
  keywords = {clones, maintenance, refactoring, transformation},
  pages = {15--36}
},
@misc{cassell_object-oriented_2011,
  title = {Object-oriented cohesion metrics},
  url = {http://homepages.ecs.vuw.ac.nz/~kcassell/cohesionMetricList.html},
  author = {Cassell, Keith},
  year = {2011},
  note = {Accessed 2011-03-21},
  howpublished = {{http://homepages.ecs.vuw.ac.nz/{\textasciitilde}kcassell/cohesionMetricList.html}}
},
@misc{chire_clusteranalysis_2010,
  title = {{ClusterAnalysis} Mouse.svg},
  copyright = {Permission to use as public domain is granted by {http://en.wikipedia.org/wiki/File:ClusterAnalysis\_Mouse.svg}},
  url = {http://upload.wikimedia.org/wikipedia/commons/thumb/0/09/ClusterAnalysis_Mouse.svg/2000px-ClusterAnalysis_Mouse.svg.png},
  author = {Chire},
  month = oct,
  year = {2010},
  note = {Accessed 2011-11-17},
  howpublished =
{{http://upload.wikimedia.org/wikipedia/commons/thumb/0/09/ClusterAnalysis\_Mouse.svg/2000px-ClusterAnalysis\_Mouse.svg.png}} },
@inproceedings{biggers_toward_2011,
  title = {Toward a metrics suite for source code lexicons},
  isbn = {978-1-4577-0663-9},
  doi = {10.1109/ICSM.2011.6080816},
  abstract = {In this paper we present an empirical study of relationships between three source code lexicons: the identifier, comment, and literal lexicons. We conjecture that shared and unique properties of these lexicons for the given subject system can inform the configuration of a source code retrieval technique for a particular software understanding activity or software evolution task. Thus, we seek to discover these lexicon properties, and so we investigate five lexicon measures that consider term frequency, term density, and term provenance.},
  booktitle = {2011 27th {IEEE} International Conference on Software Maintenance {(ICSM)}},
  publisher = {{IEEE}},
  author = {Biggers, Lauren R and Eddy, Brian P and Kraft, Nicholas A and Etzkorn, Letha H},
  month = sep,
  year = {2011},
  keywords = {correlation, Density measurement, Indexes, software lexicon, Software measurement, software metrics, Software systems, text retrieval},
  pages = {492--495}
},
@inproceedings{anquetil_experiments_1999,
  title = {Experiments with Clustering as a Software Remodularization Method},
  isbn = {0-7695-0303-9},
  url = {http://portal.acm.org/citation.cfm?id=837051},
  abstract = {As valuable software systems become older, reverse engineering becomes increasingly important to companies that have to maintain the code. Clustering is a key activity in reverse engineering that is used to discover improved designs of systems or to extract significant concepts from code. Clustering is an old, highly sophisticated, activity which offers many methods to meet different needs. The various methods have been well documented in the past, however conclusions from the general clustering literature may not apply entirely in the reverse engineering domain. In this paper, we study three things: some clustering algorithms, some metrics that quantify the coupling between entities to be clustered, and how these entities are represented abstractly. Our objective is to establish whether and why each could be used for software remodularization. The results are compared using three quality criteria: design quality (cohesion and coupling), comparison with an expert decomposition, and size of the clusters obtained. The experiments were conducted on three public domain systems (gcc, Linux and Mosaic) and a legacy telecommunications system. Among our findings, we confirm the importance of carefully choosing the scheme that is used to describe the entities being clustered.},
  booktitle = {Proceedings of the Sixth Working Conference on Reverse Engineering},
  publisher = {{IEEE} Computer Society},
  author = {Anquetil, Nicolas and Fourrier, C{\'e}dric and Lethbridge, Timothy C.},
  year = {1999},
  keywords = {clustering, software clustering, software modules},
  pages = {235},
  annote = {Evaluates agglomerative clustering as applied to software remodularization of file. Some key points: - importance of good choices of descriptions, "coupling metrics"/sim fns, clustering algo - formal features based on code relationships vs. informal rels, e.g. "semantics" of identifiers - proposes four broad categories of "similarity metrics" - proposes that cohesion is more important than coupling, so complete link is better than single link - notes problems with measuring the quality of the result using the same method that generated the result - likes Jaccard and {Sorenson-Dice} for measuring similarity, especially Jaccard, because of its simplicity}
},
@comment{iso/iec_2001: the author is a corporate name, so it is wrapped in an extra brace pair to be kept as one unit instead of being split into first/von/last parts. NOTE(review): publisher IEEE looks inconsistent with the Geneva address and the ISO copyright -- confirm.}
@book{iso_-_international_organization_for_standardization_iso/iec_2001,
  address = {Geneva, Switzerland},
  title = {{ISO/IEC} 9126-1:2001 - Software engineering -- Product quality -- Part 1: Quality model},
  copyright = {Copyright {ISO} - International Organization for Standardization. All rights reserved.},
  url = {http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=22749},
  publisher = {{IEEE}},
  author = {{ISO - International Organization for Standardization}},
  year = {2001},
  keywords = {maintainability},
  annote = {http://www.sqa.net/iso9126.html has a good (and free) overview.}
},
@comment{tempero_empirical_2008: 1530-1362 has the eight-digit ISSN shape, so it is recorded under issn.}
@inproceedings{tempero_empirical_2008,
  title = {An Empirical Study of Unused Design Decisions in Open Source Java Software},
  issn = {1530-1362},
  doi = {10.1109/APSEC.2008.30},
  abstract = {A recent study on how inheritance is used in open source Java software revealed a surprising number of interfaces that were neither implemented nor extended. While innocent explanations for this exist (the interfaces are part of frameworks that only clients of the frameworks implement), it does raise the question of how much "dead code'' exists in applications. Dead code usually refers to code within a function that cannot be executed, but unused interfaces, and more generally unused public methods, represent dead code at the "design'' level, and so can potentially have a significant impact on future maintenance costs. This paper presents a large empirical study on existence of design decisions that are unused. This study examined 100 open source Java applications. The results show a significant level of unused design decisions.},
  booktitle = {Software Engineering Conference, 2008. {APSEC} '08. 15th {Asia-Pacific}},
  author = {Tempero, E.},
  year = {2008},
  keywords = {dead code, interfaces, maintenance},
  pages = {33--40}
},
@book{meyer_object-oriented_1997,
  title = {Object-oriented Software Construction},
  publisher = {Prentice Hall},
  address = {Upper Saddle River, {NJ}},
  author = {Meyer, Bertrand},
  year = {1997},
  keywords = {{OOD}, {OOP}}
},
@article{flake_graph_2004,
  title = {Graph Clustering and Minimum Cut Trees},
  volume = {1},
  url = {http://akpeters.metapress.com/content/4641k236hj5k427j},
  abstract = {In this paper, we introduce simple graph clustering methods based on minimum cuts within the graph. The clustering methods are general enough to apply to any kind of graph but are well suited for graphs where the link structure implies a notion of reference, similarity, or endorsement, such as web and citation graphs. We show that the quality of the produced clusters is bounded by strong minimum cut and expansion criteria. We also develop a framework for hierarchical clustering and present applications to real-world data. We conclude that the clustering algorithms satisfy strong theoretical criteria and perform well in practice.},
  number = {4},
  journal = {Internet Mathematics},
  author = {Flake, Gary and Tarjan, Robert and Tsioutsiouliklis, Kostas},
  month = jan,
  year = {2004},
  keywords = {clustering, graphs},
  pages = {385--408},
  annote = {Applies graph theory to clustering. Goes beyond min-cuts by considering the intra-cluster and inter-cluster connectivity and also the sizes of the recommended clusters. Handles weighted edges, so this may be handy for considering the influence of special methods.
Relevance: 5 } },
@article{bansiya_class_1999,
  title = {A class cohesion metric for object-oriented designs},
  volume = {11},
  journal = {Journal of Object Oriented Programming},
  author = {Bansiya, J. and Etzkorn, L. and Davis, C. and Li, W.},
  year = {1999},
  keywords = {cohesion, metrics},
  pages = {47--52}
},
@article{shokoufandeh_spectral_2005,
  title = {Spectral and meta-heuristic algorithms for software clustering},
  volume = {77},
  issn = {0164-1212},
  url = {http://www.sciencedirect.com/science/article/B6V0N-4F6SSM8-2/2/ae0dfb6aeba5d78917f2a5b86c203415},
  doi = {10.1016/j.jss.2004.03.032},
  abstract = {When large software systems are reverse engineered, one of the views that is produced is the system decomposition hierarchy. This hierarchy shows the system's subsystems, the contents of the subsystems (i.e., modules or other subsystems), and so on. Software clustering tools create the system decomposition automatically or semi-automatically with the aid of the software engineer. The Bunch software clustering tool shows how meta-heuristic search algorithms can be applied to the software clustering problem, successfully. Unfortunately, we do not know how close the solutions produced by Bunch are to the optimal solution. We can only obtain the optimal solution for trivial systems using an exhaustive search. This paper presents evidence that Bunch's solutions are within a known factor of the optimal solution. We show this by applying spectral methods to the software clustering problem. The advantage of using spectral methods is that the results this technique produces are within a known factor of the optimal solution. Meta-heuristic search methods only guarantee local optimality, which may be far from the global optimum. In this paper, we apply the spectral methods to the software clustering problem and make comparisons to Bunch.
We conducted a case study to draw our comparisons and to determine if an efficient clustering algorithm, one that guarantees a near-optimal solution, can be created.},
  number = {3},
  journal = {Journal of Systems and Software},
  author = {Shokoufandeh, Ali and Mancoridis, Spiros and Denton, Trip and Maycock, Matthew},
  month = sep,
  year = {2005},
  keywords = {clustering, search},
  pages = {213--223}
},
@inproceedings{ester_density-based_1996,
  address = {Portland, Oregon, {USA}},
  title = {A density-based algorithm for discovering clusters in large spatial databases with noise},
  volume = {96},
  abstract = {Clustering algorithms are attractive for the task of class identification in spatial databases. However, the application to large spatial databases rises the following requirements for clustering algorithms: minimal requirements of domain knowledge to determine the input parameters, discovery of clusters with arbitrary shape and good efficiency on large databases. The well-known clustering algorithms offer no solution to the combination of these requirements. In this paper, we present the new clustering algorithm {DBSCAN} relying on a density-based notion of clusters which is designed to discover clusters of arbitrary shape. {DBSCAN} requires only one input parameter and supports the user in determining an appropriate value for it. We performed an experimental evaluation of the effectiveness and efficiency of {DBSCAN} using synthetic data and real data of the {SEQUOIA} 2000 benchmark. The results of our experiments demonstrate that (1) {DBSCAN} is significantly more effective in discovering clusters of arbitrary shape than the well-known algorithm {CLARANS}, and that (2) {DBSCAN} outperforms {CLARANS} by a factor of more than 100 in terms of efficiency.},
  booktitle = {Proceedings of the 2nd International Conference on Knowledge Discovery and Data mining},
  publisher = {{AAAI} Press},
  author = {Ester, Martin and Kriegel, {Hans-Peter} and Sander, Jorg and Xu, Xiaowei},
  year = {1996},
  pages = {226--231}
},
@comment{ferenc_design_2005: 1063-6773 has the eight-digit ISSN shape, so it is recorded under issn.}
@inproceedings{ferenc_design_2005,
  title = {Design pattern mining enhanced by machine learning},
  issn = {1063-6773},
  doi = {10.1109/ICSM.2005.40},
  abstract = {Design patterns present good solutions to frequently occurring problems in object-oriented software design. Thus their correct application in a system's design may significantly improve its internal quality attributes such as reusability and maintainability. In software maintenance the existence of up-to-date documentation is crucial, so the discovery of as yet unknown design pattern instances can help improve the documentation. Hence a reliable design pattern recognition system is very desirable. However, simpler methods (based on pattern matching) may give imprecise results due to the vague nature of the patterns' structural description. In previous work we presented a pattern matching-based system using the Columbus framework with which we were able to find pattern instances from the source code by considering the patterns' structural descriptions only, and therefore we could not identify false hits and distinguish similar design patterns such as state and strategy. In the present work we use machine learning to enhance pattern mining by filtering out as many false hits as possible. To do so we distinguish true and false pattern instances with the help of a learning database created by manually tagging a large C++ system.},
  booktitle = {Software Maintenance, 2005. {ICSM'05.} Proceedings of the 21st {IEEE} International Conference on},
  author = {Ferenc, R. and Beszedes, A. and Fulop, L. and Lele, J.},
  year = {2005},
  keywords = {data mining, design pattern mining, design patterns, machine learning, maintenance, {OOD}, pattern matching, patterns, software reusability},
  pages = {295--304}
},
@misc{_weka_????,
  title = {Weka 3 - Data Mining with Open Source Machine Learning Software in Java},
  url = {http://www.cs.waikato.ac.nz/ml/weka/},
  note = {Accessed 2011-10-03},
  howpublished = {http://www.cs.waikato.ac.nz/ml/weka/}
},
@comment{schulze_towards_2008: booktitle is presumed to be the same workshop as jemerov_implementing_2008, which shares this address, month and year -- TODO confirm against the published proceedings.}
@inproceedings{schulze_towards_2008,
  address = {Nashville, Tennessee, {USA}},
  title = {Towards a Refactoring Guideline Using Code Clone Classification},
  booktitle = {Proceedings of the Second {ACM} Workshop on Refactoring Tools},
  author = {Schulze, Sandro and Kuhlemann, Martin and Rosenmueller, Marko},
  month = oct,
  year = {2008},
  keywords = {clones, refactoring},
  annote = {This paper discusses refactoring of cloned code from both an {OO} and an aspect-oriented perspective. It is also useful for the cloning references it has. Relevance: 4}
},
@article{grove_call_1997,
  title = {Call graph construction in object-oriented languages},
  volume = {32},
  url = {http://portal.acm.org/citation.cfm?id=263700.264352},
  doi = {10.1145/263700.264352},
  abstract = {Interprocedural analyses enable optimizing compilers to more precisely model the effects of non-inlined procedure calls, potentially resulting in substantial increases in application performance. Applying interprocedural analysis to programs written in object-oriented or functional languages is complicated by the difficulty of constructing an accurate program call graph. This paper presents a parameterized algorithmic framework for call graph construction in the presence of message sends and/or first class functions. We use this framework to describe and to implement a number of well-known and new algorithms. We then empirically assess these algorithms by applying them to a suite of medium-sized programs written in Cecil and Java, reporting on the relative cost of the analyses, the relative precision of the constructed call graphs, and the impact of this precision on the effectiveness of a number of interprocedural optimizations.},
  number = {10},
  journal = {{ACM} {SIGPLAN} Notices},
  author = {Grove, David and {DeFouw}, Greg and Dean, Jeffrey and Chambers, Craig},
  year = {1997},
  keywords = {call graph, graphs},
  pages = {108--124}
},
@article{rosvall_information-theoretic_2007,
  title = {An information-theoretic framework for resolving community structure in complex networks},
  volume = {104},
  url = {http://www.pnas.org/content/104/18/7327.abstract},
  doi = {10.1073/pnas.0611034104},
  abstract = {To understand the structure of a large-scale biological, social, or technological network, it can be helpful to decompose the network into smaller subunits or modules. In this article, we develop an information-theoretic foundation for the concept of modularity in networks. We identify the modules of which the network is composed by finding an optimal compression of its topology, capitalizing on regularities in its structure. We explain the advantages of this approach and illustrate them by partitioning a number of real-world and model networks.},
  number = {18},
  journal = {Proceedings of the National Academy of Sciences},
  author = {Rosvall, Martin and Bergstrom, Carl T.},
  month = may,
  year = {2007},
  keywords = {{SNA}},
  pages = {7327--7331}
},
@inproceedings{steimann_towards_2003,
  address = {New York, {NY}, {USA}},
  series = {{PPPJ} '03},
  title = {Towards the systematic use of interfaces in {JAVA} programming},
  isbn = {0-9544145-1-9},
  location = {Kilkenny City, Ireland},
  url = {http://portal.acm.org/citation.cfm?id=957289.957295},
  abstract = {{JAVA's} interface construct is widely perceived as a weak surrogate for multiple inheritance. Consequently, it should come as no surprise that despite their potential for writing highly decoupled code, interfaces are used rather sparingly. We have devised a conceptual framework for the utilization of interfaces in {JAVA} programs, and suggest tool support lessening the coding effort induced by the introduction and maintenance of additional interfaces, as well as a metrics suit measuring how and to which extent interfaces are actually used.},
  booktitle = {Proceedings of the 2nd international conference on Principles and practice of programming in Java},
  publisher = {Computer Science Press, Inc.},
  author = {Steimann, Friedrich and Siberski, Wolf and K{\"u}hne, Thomas},
  year = {2003},
  note = {{ACM} {ID:} 957295},
  keywords = {interfaces},
  pages = {13--17}
},
@inproceedings{wahler_clone_2004,
  title = {Clone detection in source code by frequent itemset techniques},
  abstract = {In this paper we describe a new approach for the detection of clones in source code, which is inspired by the concept of frequent itemsets from data mining. The source code is represented as an abstract syntax tree in {XML.} Currently, such {XML} representations exist for instance for Java, C++, or {PROLOG.} Our approach is very flexible; it can be configured easily to work with multiple programming languages},
  booktitle = {Source Code Analysis and Manipulation, 2004. Fourth {IEEE} International Workshop on},
  author = {Wahler, V. and Seipel, D. and Wolff, J.
and Fischer, G.},
  year = {2004},
  keywords = {clone detection, clustering, data mining, {XML}},
  pages = {128--135}
},
@article{wallnau_construction_1989,
  title = {Construction of {Knowledge-Based} Components and Applications in Ada},
  volume = {1},
  number = {4},
  journal = {Intelligent Systems Review},
  author = {Wallnau, Kurt and Solderitsch, James and Simos, Mark and {McDowell}, Raymond and Cassell, Keith and Campbell, David},
  year = {1989},
  annote = {Also available in the Proceedings of {AIDA}, Fourth Annual Conference on Artificial Intelligence and Ada. November, 1988. p.p. 3-1 through 3-21.}
},
@inproceedings{chatzigeorgiou_application_2006,
  address = {New York, {NY}, {USA}},
  title = {Application of graph theory to {OO} software engineering},
  isbn = {{1-59593-409-X}},
  location = {Shanghai, China},
  doi = {10.1145/1137661.1137669},
  abstract = {Graph Theory, which studies the properties of graphs, has been widely accepted as a core subject in the knowledge of computer scientists. So is {Object-Oriented} {(OO)} software engineering, which deals with the analysis, design and implementation of systems employing classes as modules. The latter field can greatly benefit from the application of Graph Theory, since the main mode of representation, namely the class diagram, is essentially a directed graph. The study of graph properties can be valuable in many ways for understanding the characteristics of the underlying software systems. Representative examples for the usefulness of graph theory on {OO} systems based on recent research results are presented in this paper.},
  booktitle = {{WISER} '06: Proceedings of the 2006 international workshop on Workshop on interdisciplinary software engineering research},
  publisher = {{ACM}},
  author = {Chatzigeorgiou, Alexander and Tsantalis, Nikolaos and Stephanides, George},
  year = {2006},
  keywords = {design patterns, graphs},
  pages = {29--36},
  annote = {Discusses the representation of a program as a generic mathematical graph and conversions between {UML} and "standard" graphs. Four different applications of Graph Theory are demonstrated: the identification of {"God"} classes, clustering, detection of design patterns and scale-freeness of {OO} systems. One interesting twist is the weighting of edges to indicate things such as the number of calls from one class to another. Relevance: 4}
},
@comment{porter_algorithm_1980: 1-55860-454-5 has the ten-digit ISBN shape rather than the eight-digit ISSN shape, so it is recorded under isbn. NOTE(review): it presumably identifies a reprint volume, not the journal -- confirm.}
@article{porter_algorithm_1980,
  title = {An algorithm for suffix stripping},
  volume = {14},
  isbn = {1-55860-454-5},
  url = {http://portal.acm.org/citation.cfm?id=275705},
  number = {3},
  journal = {Program},
  author = {Porter, M. F.},
  year = {1980},
  keywords = {stemming},
  pages = {130--137}
},
@inproceedings{counsell_object-oriented_2005,
  title = {Object-oriented cohesion as a surrogate of software comprehension: an empirical study},
  isbn = {0-7695-2292-0},
  shorttitle = {Object-oriented cohesion as a surrogate of software comprehension},
  url = {http://portal.acm.org/citation.cfm?id=1100804},
  abstract = {The concept of software cohesion in both the procedural and object-oriented paradigm is well known and documented. What is not so well known or documented is the perception of what empirically constitutes a cohesive unit by software engineers. In this paper, we describe an empirical investigation using object-oriented {(OO)} classes as a basis. Twenty four subjects (drawn from {IT} experienced and {IT} inexperienced groups) were asked to rate ten classes sampled from two industrial systems in terms of their overall cohesiveness; a class environment was used to carry out the study. Four key results were observed. Firstly, class size (when expressed in terms of number of methods) did not tend to influence the perception of cohesion by any subjects. Secondly, well-commented classes were rated most highly amongst both {IT} experienced and inexperienced subjects. Thirdly, the empirical study suggests that cohesion comprises a combination of various class factors including low coupling, small numbers of attributes and well commented methods, rather than any single, individual class feature per se. Finally, the research supports the view that cohesion is a subjective concept reflecting a cognitive combination of class features; as such it is a surrogate for class comprehension.},
  booktitle = {Proceedings of the Fifth {IEEE} International Workshop on Source Code Analysis and Manipulation},
  publisher = {{IEEE} Computer Society},
  author = {Counsell, Steve and Swift, Stephen and Tucker, Allan},
  year = {2005},
  keywords = {cohesion, coupling, empirical},
  pages = {161--172},
  annote = {User study - users seemed to think that well-commented code was cohesive.}
},
@inproceedings{poshyvanyk_integrating_2007,
  title = {Integrating {COTS} Search Engines into Eclipse: Google Desktop Case Study},
  isbn = {0-7695-2966-6},
  shorttitle = {Integrating {COTS} Search Engines into Eclipse},
  url = {http://portal.acm.org/citation.cfm?id=1270232.1270285&coll=GUIDE&dl=GUIDE&CFID=101258307&CFTOKEN=28035505},
  abstract = {The paper presents an integration of the Google Desktop Search {(GDS)} engine into the Eclipse development environment. The resulting tool, namely Google Eclipse Search {(GES)}, provides enhanced searching in Eclipse software projects.
The paper advocates for a {COTS} component-based approach to develop useful and reliable research prototypes, which support various software maintenance tasks. The development effort for such the tools is reduced, while customization and flexibility, to fully support the needs of developers, is maintained. The proposed solution takes advantages of the power of {GDS} for quick and accurate searching and of Eclipse for great extensibility. The paper outlines our experiences of integrating {GDS} engine into Eclipse as well as possible extensions and applications of the proposed tool.},
  booktitle = {Proceedings of the Second International Workshop on Incorporating {COTS} Software into Software Systems: Tools and Techniques},
  publisher = {{IEEE} Computer Society},
  author = {Poshyvanyk, Denys and Petrenko, Maksym and Marcus, Andrian},
  year = {2007},
  keywords = {Eclipse},
  pages = {6}
},
@article{counsell_interpretation_2006,
  title = {The interpretation and utility of three cohesion metrics for object-oriented design},
  volume = {15},
  url = {http://portal.acm.org/citation.cfm?doid=1131421.1131422},
  doi = {10.1145/1131421.1131422},
  abstract = {The concept of cohesion in a class has been the subject of various recent empirical studies and has been measured using many different metrics. In the structured programming paradigm, the software engineering community has adopted an informal yet meaningful and understandable definition of cohesion based on the work of Yourdon and Constantine. The object-oriented {(OO)} paradigm has formalised various cohesion measures, but the argument over the most meaningful of those metrics continues to be debated. Yet achieving highly cohesive software is fundamental to its comprehension and thus its maintainability. In this article we subject two object-oriented cohesion metrics, {CAMC} and {NHD}, to a rigorous mathematical analysis in order to better understand and interpret them. This analysis enables us to offer substantial arguments for preferring the {NHD} metric to {CAMC} as a measure of cohesion. Furthermore, we provide a complete understanding of the behaviour of these metrics, enabling us to attach a meaning to the values calculated by the {CAMC} and {NHD} metrics. In addition, we introduce a variant of the {NHD} metric and demonstrate that it has several advantages over {CAMC} and {NHD.} While it may be true that a generally accepted formal and informal definition of cohesion continues to elude the {OO} software engineering community, there seems considerable value in being able to compare, contrast, and interpret metrics which attempt to measure the same features of software.},
  number = {2},
  journal = {{ACM} Transactions on Software Engineering and Methodology},
  author = {Counsell, Steve and Swift, Stephen and Crampton, Jason},
  year = {2006},
  keywords = {cohesion, metrics},
  pages = {123--149}
},
@comment{marin_identifying_2007: the lowercase particle is placed before the surname in the author list so that it is parsed as the von-part of the name rather than as a given name.}
@article{marin_identifying_2007,
  title = {Identifying crosscutting concerns using fan-in analysis},
  volume = {17},
  url = {http://portal.acm.org/citation.cfm?id=1314496},
  doi = {10.1145/1314493.1314496},
  abstract = {Aspect mining is a reverse engineering process that aims at finding crosscutting concerns in existing systems. This article proposes an aspect mining approach based on determining methods that are called from many different places, and hence have a high fan-in, which can be seen as a symptom of crosscutting functionality. The approach is semiautomatic, and consists of three steps: metric calculation, method filtering, and call site analysis. Carrying out these steps is an interactive process supported by an Eclipse plug-in called {FINT.} Fan-in analysis has been applied to three open source Java systems, totaling around 200,000 lines of code. The most interesting concerns identified are discussed in detail, which includes several concerns not previously discussed in the aspect-oriented literature. The results show that a significant number of crosscutting concerns can be recognized using fan-in analysis, and each of the three steps can be supported by tools.},
  number = {1},
  journal = {{ACM} Transactions on Software Engineering and Methodology},
  author = {Marin, Marius and van Deursen, Arie and Moonen, Leon},
  year = {2007},
  keywords = {aspects, crosscutting concern, Eclipse, metrics, reverse engineering},
  pages = {1--37}
},
@article{bavota_identifying_2011,
  title = {Identifying Extract Class refactoring opportunities using structural and semantic cohesion measures},
  volume = {84},
  issn = {0164-1212},
  url = {http://www.sciencedirect.com/science/article/B6V0N-51MDS92-2/2/d86d07983c55c9e6345b10606a8b08e2},
  doi = {10.1016/j.jss.2010.11.918},
  abstract = {Approaches for improving class cohesion identify refactoring opportunities using metrics that capture structural relationships between the methods of a class, e.g., attribute references. Semantic metrics, e.g., C3 metric, have also been proposed to measure class cohesion, as they seem to complement structural metrics. However, until now semantic relationships between methods have not been used to identify refactoring opportunities. In this paper we propose an Extract Class refactoring method based on graph theory that exploits structural and semantic relationships between methods. The empirical evaluation of the proposed approach highlighted the benefits provided by the combination of semantic and structural measures and the potential usefulness of the proposed method as a feature for software development environments.},
  number = {3},
  journal = {Journal of Systems and Software},
  author = {Bavota, Gabriele and De Lucia, Andrea and Oliveto, Rocco},
  month = mar,
  year = {2011},
  keywords = {empirical, extract class, Graph Theory, {MaxFlow-MinCut}, refactoring, semantics},
  pages = {397--414}
},
@inproceedings{fleissner_commensalistic_2006,
  address = {Portland, Oregon, {USA}},
  title = {A commensalistic software system},
  isbn = {{1-59593-491-X}},
  url = {http://portal.acm.org/citation.cfm?id=1176617.1176629&coll=ACM&dl=ACM&type=series&idx=SERIES318&part=series&WantType=Proceedings&title=OOPSLA&CFID=91794590&CFTOKEN=23878168},
  doi = {10.1145/1176617.1176629},
  abstract = {The development of reliable software is a challenging task, especially in a business environment that forces developers to focus on meeting tight deadlines instead of producing quality software. Researchers and practitioners are exploring various approaches for addressing this problem, such as autonomic computing and conscientious autopoietic software. These approaches describe software systems that are capable of managing and preserving themselves. In this paper, we propose a new, concrete self-managing software architecture based on the biological concept of commensalistic symbiosis and the notion of autopoietic software. We present a detailed description of our architecture, and a working prototype of a minimal commensalistic system.
In addition, we specify a new programming language, examine usage scenarios and discuss implementation issues for realizing a working commensalistic system on a larger scale.}, booktitle = {Companion to the 21st {ACM} {SIGPLAN} symposium on Object-oriented programming systems, languages, and applications}, publisher = {{ACM}}, author = {Fleissner, Sebastian and Baniassad, Elisa}, year = {2006}, keywords = {autonomic computing, symbiosis}, pages = {560--573} }, @inproceedings{bhatti_reconsidering_2008, title = {Reconsidering Classes in Procedural {Object-Oriented} Code}, abstract = {Object-oriented software may show signs of procedural thinking because of lack of design or due to design erosion over a period of time. We refer to such a software as procedural object-oriented code. Huge classes, scarce class hierarchies and absence of classes for domain entities are hallmarks of procedural object-oriented code. Due to huge investments in such systems, software restructuring becomes necessary. To support code modularization, it is important to identify useful domain abstractions. In this paper, we present a tool-assisted technique to identify useful abstractions and class hierarchies in procedural object-oriented code. During this task, principal classes (draft classes) are identified. Afterwards, composition and association relationships are inferred for principal classes. Lastly, Formal Concept Analysis {(FCA)} is used to analyze hierarchical relationships between methods and attributes within principal classes. 
We validated our approach on several case studies and report our results on an industrial case.}, booktitle = {Proceedings of the 2008 15th Working Conference on Reverse {Engineering-Volume} 00}, publisher = {{IEEE} Computer Society Washington, {DC}, {USA}}, author = {Bhatti, Muhammed and Ducasse, Stephane and Huchard, Marianne}, year = {2008}, keywords = {{FCA}, restructuring}, pages = {257--266} }, @book{ra_mining_2008, address = {Berlin, Heidelberg}, series = {Lecture Notes in Computer Science}, title = {Mining Complex Data {ECML/PKDD} 2007 Third International Workshop, {MCD} 2007, Warsaw, Poland, September 17-21, 2007, Revised Selected Papers}, isbn = {9783540684169}, number = {4944}, publisher = {{Springer-Verlag} Berlin Heidelberg}, author = {Ra\'{s}, Zbigniew W and Tsumoto, Shusaku and Zighed, Djamel and {SpringerLink} {(Online} service)}, year = {2008}, keywords = {data mining} }, @inproceedings{perez_perspectives_2009, address = {Amsterdam, The Netherlands}, series = {{IWPSE-Evol} '09}, title = {Perspectives on automated correction of bad smells}, isbn = {978-1-60558-678-6}, url = {http://portal.acm.org/citation.cfm?id=1595827}, doi = {10.1145/1595808.1595827}, abstract = {Keeping a software system conformant with a desired architecture and consistent with good design principles is a recurring task during the software evolution process. 
Deviations from good design principles can manifest in the form of bad smells: problems in the system's structure that can negatively affect software quality factors.},
  booktitle = {Proceedings of the Joint International and Annual {ERCIM} Workshops on Principles of Software Evolution {(IWPSE)} and Software Evolution {(Evol)} Workshops},
  publisher = {{ACM}},
  author = {P{\'e}rez, Javier and Crespo, Yania},
  year = {2009},
  keywords = {automated planning, refactoring, smells},
  pages = {99--108}
}

@article{soares_making_2010,
  title = {Making Program Refactoring Safer},
  volume = {27},
  issn = {0740-7459},
  doi = {10.1109/MS.2010.63},
  abstract = {Developers rely on compilation, test suite and tools to preserve observable behavior during refactoring. However, most of the refactoring tools do not implement all preconditions that guarantee the refactoring correctness, since formally identifying them is cost-prohibitive. Therefore, these tools may perform non-behavior preserving transformations. We present a tool for improving safety during refactoring. It automatically generates a test suite that is suited for detecting behavioral changes. We used our tool to evaluate seven real case study refactorings (from 3 to 100 {KLOC).} We reason about a {JHotDraw} (23 {KLOC)} and its refactored version, and automatically detected a behavioral change. This problem was not identified by developers. Finally, we also evaluated our tool against 17 defective refactorings that are not detected by refactoring tools. 
It automatically detects most of those errors within a few seconds.},
  number = {4},
  journal = {{IEEE} Software},
  author = {Soares, Gustavo and Gheyi, Rohit and Serey, Dalton and Massoni, Tiago},
  year = {2010},
  keywords = {refactoring, unit testing},
  pages = {52--57}
}

@inproceedings{kirasi_ontology-based_2008,
  address = {Zagreb, Croatia},
  title = {{Ontology-Based} Design Pattern Recognition},
  isbn = {978-3-540-85562-0},
  url = {http://portal.acm.org/citation.cfm?id=1430106&jmp=cit&coll=GUIDE&dl=ACM},
  abstract = {This paper presents ontology-based architecture for pattern recognition in the context of static source code analysis. The proposed system has three subsystems: parser, {OWL} ontologies and analyser. The parser subsystem translates the input code to {AST} that is constructed as an {XML} tree. The {OWL} ontologies define code patterns and general programming concepts. The analyser subsystem constructs instances of the input code as ontology individuals and asks the reasoner to classify them. The experience gained in the implementation of the proposed system and some practical issues are discussed. 
The recognition system successfully integrates the knowledge representation field and static code analysis, resulting in greater flexibility of the recognition system.},
  booktitle = {Proceedings of the 12th international conference on {Knowledge-Based} Intelligent Information and Engineering Systems, Part I},
  publisher = {{Springer-Verlag}},
  author = {Kirasi{\'c}, Damir and Basch, Danko},
  year = {2008},
  keywords = {description logics, kbs, ontologies, {OWL}, static code analysis},
  pages = {384--393},
  annote = {Uses patterns coded in {OWL} and {SWRL} rules to identify instances from {ASTs.} Relevance: 5 }
}

@inproceedings{strehl_impact_2000,
  title = {Impact of Similarity Measures on Web-page Clustering},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.41.2110},
  abstract = {Clustering of web documents enables (semi-)automated categorization, and facilitates certain types of search. Any clustering method has to embed the documents in a suitable similarity space. While several clustering methods and the associated similarity measures have been proposed in the past, there is no systematic comparative study of the impact of similarity metrics on cluster quality, possibly because the popular cost criteria do not readily translate across qualitatively different metrics. We observe that in domains such as Yahoo that provide a categorization by human experts, a useful criteria for comparisons across similarity metrics is indeed available. We then compare four popular similarity measures {(Euclidean}, cosine, Pearson correlation and extended Jaccard) in conjunction with several clustering techniques (random, self-organizing feature map, hyper-graph partitioning, generalized k-means, weighted graph partitioning), on high dimensional sparse data representing web documents. Performance is measured against a human-imposed classification into news categories and industry categories. 
We conduct a number of experiments and use t-tests to assure statistical significance of results. Cosine and extended Jaccard similarities emerge as the best measures to capture human categorization behavior, while Euclidean performs poorest. Also, weighted graph partitioning approaches are clearly superior to all others.},
  booktitle = {Workshop on Artificial Intelligence for Web Search {(AAAI} 2000)},
  publisher = {{AAAI}},
  author = {Strehl, Alexander and Ghosh, Joydeep and Mooney, Raymond},
  year = {2000},
  keywords = {clustering, similarity, world wide web},
  pages = {58--64}
}

@article{kang_quantitative_1999,
  title = {A quantitative framework for software restructuring},
  volume = {11},
  url = {http://portal.acm.org/citation.cfm?id=325528},
  number = {4},
  journal = {Journal of Software Maintenance},
  author = {Kang, {Byung-Kyoo} and Bieman, James M.},
  year = {1999},
  keywords = {metrics, restructuring},
  pages = {245--284}
}

@article{koru_investigation_2009,
  title = {An Investigation into the Functional Form of the {Size-Defect} Relationship for Software Modules},
  volume = {35},
  abstract = {The importance of the relationship between the size and defect proneness of software modules is well recognized. Understanding the nature of that relationship can facilitate various development decisions related to prioritization of quality assurance activities. Overall, the previous research only drew a general conclusion that there was a monotonically increasing relationship between module size and defect proneness. In this study, we analyzed class-level size and defect data in order to increase our understanding of this crucial relationship. We studied four large-scale object-oriented products, Mozilla, Cn3d, {JBoss}, and Eclipse. We observed that defect proneness increased as class size increased, but at a slower rate; smaller classes were proportionally more problematic than larger classes. 
Therefore, practitioners should consider giving higher priority to smaller modules when planning focused quality assurance activities with limited resources. For example, in Mozilla and Eclipse, an inspection strategy investing 80 percent of available resources on {100-LOC} classes and the rest on {1,000-LOC} classes would be more than twice as cost-effective as the opposite strategy. These results should be immediately useful to guide focused quality-assurance activities in large-scale software projects.},
  number = {2},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Koru, A. Gunes and Zhang, Dongsong and El Emam, Khaled and Liu, Hongfang},
  year = {2009},
  keywords = {metrics, size},
  pages = {293--304}
}

@article{tip_refactoring_2007,
  title = {Refactoring using type constraints},
  abstract = {Type constraints express subtype-relationships between the types of program expressions that are required for type-correctness, and were originally proposed as a convenient framework for solving type checking and type inference problems. In this paper, we show how type constraints can be used as the basis for practical refactoring tools. In our approach, a set of type constraints is derived from a type-correct program P. The main insight behind our work is the fact that P constitutes just one solution to this constraint system, and that alternative solutions may exist that correspond to refactored versions of P. We show how a number of refactorings for manipulating types and class hierarchies can be expressed naturally using type constraints. 
Several refactorings in the standard distribution of Eclipse are based on our results.},
  journal = {Static Analysis},
  author = {Tip, Frank},
  year = {2007},
  keywords = {refactoring},
  pages = {1--17}
}

@inproceedings{bodhuin_sormasa:_2007,
  title = {{SORMASA}: A tool for suggesting model refactoring actions by metrics-led genetic algorithm},
  abstract = {In this paper we introduce {SORMASA}, {SOftware} Refactoring using software Metrics And Search Algorithms, a refactoring decision support tool based on optimization techniques, in particular Genetic Algorithms.},
  booktitle = {1st Workshop on Refactoring Tools ({WRT}'07)},
  author = {Bodhuin, T. and Canfora, G. and Troiano, L.},
  year = {2007},
  keywords = {genetic algorithms, refactoring},
  pages = {23--24}
}

@inproceedings{jin_huang_aspect_2010,
  title = {Aspect Mining Using Link Analysis},
  doi = {10.1109/FCST.2010.20},
  abstract = {Aspect mining is a technique that decouples the crosscutting concerns from existing software systems. The goal of aspect mining is to refactor the existing software systems with Aspect Oriented Programming technology. Inspired by the link analysis of information retrieval technology, this paper describes a two-state model to approximate how crosscutting concerns can be discovered in the concern graphs extracted from programs. Our mining algorithm generates {\textquotedblright}scatter{\textquotedblright} and {\textquotedblright}centralization{\textquotedblright} of each program element for the final ranking. The convergency of the algorithm proves fast. The Ranking technique, considering both {\textquotedblright}scatter{\textquotedblright} and {\textquotedblright}centralization{\textquotedblright}, produces a final ranking for identifying crosscutting concerns. Our aspect mining approach is evaluated on numerous Java programs that are of the typical selections for aspect mining. 
Compared with existing aspect mining approaches, our mining approach captures more information that helps domain experts refactor software systems and prove effective in identifying crosscutting concerns.},
  booktitle = {Frontier of Computer Science and Technology {(FCST)}, 2010 Fifth International Conference on},
  author = {Huang, Jin and Lu, Yansheng and Yang, Jing},
  year = {2010},
  keywords = {aspects, data mining, link analysis, refactoring},
  pages = {312--317}
}

@article{steimann_patterns_2005,
  title = {Patterns of {Interface-Based} Programming},
  volume = {4},
  abstract = {Modern software architectures heavily promote the use of interfaces. Originally conceived as a means to separate specification from implementation, popular programming languages today accommodate interfaces as special kinds of types that can be used {\textendash} in place of classes {\textendash} in variable declarations. While it is clear that these interfaces offer polymorphism independent of the inheritance hierarchy, little has been said about the systematic use of interfaces, or how they are actually used in practice. By providing a set of basic patterns of interface use together with numbers of their frequency we provide insights that should be of interest not only to the practising programmer, but also to the designers and analysts of large code bases.},
  number = {5},
  journal = {Journal of Object Technology},
  author = {Steimann, Friedrich and Mayer, Philip},
  month = aug,
  year = {2005},
  keywords = {interfaces},
  pages = {75--94}
}

@article{baker_parameterized_1997,
  title = {Parameterized Duplication in Strings: Algorithms and an Application to Software Maintenance},
  volume = {26},
  shorttitle = {Parameterized Duplication in Strings},
  url = {http://portal.acm.org/citation.cfm?id=264398},
  abstract = {As an aid in software maintenance, it would be useful to be able to track down duplication in large software systems efficiently. 
Duplication in code is often in the form of sections of code that are the same except for a systematic change of parameters such as identifiers and constants. To model such parameterized duplication in code, this paper introduces the notions of parameterized strings and parameterized matches of parameterized strings. A data structure called a parameterized suffix tree is defined to aid in searching for parameterized matches. For fixed alphabets, algorithms are given to construct a parameterized suffix tree in linear time and to find all maximal parameterized matches over a threshold length in a parameterized p-string in time linear in the size of the input plus the number of matches reported. The algorithms have been implemented, and experimental results show that they perform well on C code.},
  number = {5},
  journal = {{SIAM} Journal on Computing},
  author = {Baker, Brenda S.},
  year = {1997},
  keywords = {clones, pattern matching, string matching},
  pages = {1343--1362}
}

@inproceedings{gui_new_2008,
  address = {Hunan},
  title = {New coupling and cohesion metrics for evaluation of software component reusability},
  doi = {10.1109/ICYCS.2008.270},
  abstract = {An account of new measure of coupling and cohesion developed to assess the reusability of Java components is proposed in this paper. These measures differ from the majority of established metrics in two respects: they reflect the degree to which entities are coupled or resemble each other, and they take account of indirect couplings or similarities. An empirical comparison of the new measures with eight established metrics is described. The new measures are shown to be consistently superior at measuring component reusability.},
  booktitle = {The 9th International Conference for Young Computer Scientists, 2008. {ICYCS} 2008.},
  author = {Gui, G. and Scott, P. 
D.},
  year = {2008},
  keywords = {cohesion, coupling, indirect coupling, metrics},
  pages = {1181--1186}
}

@inproceedings{foote_big_1997,
  address = {Monticello, Illinois},
  title = {Big ball of mud},
  abstract = {While much attention has been focused on high-level software architectural patterns, what is, in effect, the de-facto standard software architecture is seldom discussed. This paper examines this most frequently deployed of software architectures: the {BIG} {BALL} {OF} {MUD.} A {BIG} {BALL} {OF} {MUD} is a casually, even haphazardly, structured system. Its organization, if one can call it that, is dictated more by expediency than design. Yet, its enduring popularity cannot merely be indicative of a general disregard for architecture. These patterns explore the forces that encourage the emergence of a {BIG} {BALL} {OF} {MUD}, and the undeniable effectiveness of this approach to software architecture. What are the people who build them doing right? If more high-minded architectural approaches are to compete, we must understand what the forces that lead to a {BIG} {BALL} {OF} {MUD} are, and examine alternative ways to resolve them. A number of additional patterns emerge out of the {BIG} {BALL} {OF} {MUD.} We discuss them in turn. Two principal questions underlie these patterns: Why are so many existing systems architecturally undistinguished, and what can we do to improve them?},
  author = {Foote, Brian and Yoder, Joseph},
  year = {1997},
  keywords = {maintenance, patterns, size, smells},
  annote = {"refactor unrelentingly"}
}

@inproceedings{pedersen_wordnet::_2004,
  title = {{WordNet::Similarity} -- Measuring the Relatedness of Concepts},
  booktitle = {Proceedings of the National Conference on Artificial Intelligence},
  publisher = {{AAAI} Press / {MIT} Press},
  address = {Menlo Park, {CA}},
  author = {Pedersen, T. and Patwardhan, S. 
and Michelizzi, J.},
  year = {2004},
  keywords = {semantic networks, semantics},
  pages = {1024--1025}
}

@article{schaeffer_graph_2007,
  title = {Graph clustering},
  volume = {1},
  issn = {1574-0137},
  url = {http://www.sciencedirect.com/science/article/B8JDG-4PBG1S7-1/2/6537f3d1ffbf391086c60dbeba874b13},
  doi = {10.1016/j.cosrev.2007.05.001},
  abstract = {In this survey we overview the definitions and methods for graph clustering, that is, finding sets of "related" vertices in graphs. We review the many definitions for what is a cluster in a graph and measures of cluster quality. Then we present global algorithms for producing a clustering for the entire vertex set of an input graph, after which we discuss the task of identifying a cluster for a specific seed vertex by local computation. Some ideas on the application areas of graph clustering algorithms are given. We also address the problematics of evaluating clusterings and benchmarking cluster algorithms.},
  number = {1},
  journal = {Computer Science Review},
  author = {Schaeffer, Satu Elisa},
  month = aug,
  year = {2007},
  keywords = {clustering, graphs, survey},
  pages = {27--64},
  annote = {In addition to the survey, this paper gives good background material on complexity, graph theory, approximation, and Markov Chains.}
}

@article{fowler_public_2002,
  title = {Public versus published interfaces},
  volume = {19},
  url = {http://portal.acm.org/citation.cfm?id=626376},
  number = {2},
  journal = {{IEEE} Software},
  author = {Fowler, Martin},
  year = {2002},
  keywords = {interfaces},
  pages = {18--19}
}

@misc{qualitas_research_group_qualitas_2010,
  title = {Qualitas Corpus Version 20101126},
  copyright = {University of Auckland},
  url = {http://qualitascorpus.com},
  author = {{Qualitas Research Group}},
  month = nov,
  year = {2010},
  howpublished = {qualitascorpus.com}
}

@inproceedings{cassell_dual_2011,
  address = {Miami, {FL}},
  title = {A Dual Clustering Approach to the Extract Class Refactoring},
  abstract = {Large classes typically have many 
internal interactions between their members, making them difficult to understand and expensive to maintain. When large classes are split into smaller, more cohesive classes, maintainability costs can be reduced; however, the very complexity that makes the classes costly to maintain also makes them difficult to split. Our {ExtC} tool uses clustering techniques to help solve this problem. By clustering based on the structural characteristics of the class, followed by additional clustering based on semantics, {ExtC} divides the members of large classes into groups. These groups provide the basis for refactoring the large classes into smaller, more cohesive ones.},
  booktitle = {Proceedings of the 23rd International Conference on Software Engineering \& Knowledge Engineering},
  author = {Cassell, Keith and Andreae, Peter and Groves, Lindsay},
  month = jul,
  year = {2011},
  pages = {77--82}
}

@inproceedings{schleimer_winnowing:_2003,
  title = {Winnowing: local algorithms for document fingerprinting},
  shorttitle = {Winnowing},
  booktitle = {Proceedings of the 2003 {ACM} {SIGMOD} international conference on Management of data},
  publisher = {{ACM}},
  address = {New York, {NY}, {USA}},
  author = {Schleimer, Saul and Wilkerson, Daniel and Aiken, Alex},
  year = {2003},
  pages = {76--85}
}

@article{mens_graph-based_2002,
  title = {A {Graph-Based} Metamodel for {Object-Oriented} Software Metrics},
  volume = {72},
  issn = {1571-0661},
  url = {http://www.sciencedirect.com/science/article/B75H1-4G35F72-8/2/c520d497f622ada14db6aa12806f76e3},
  doi = {10.1016/S1571-0661(05)80529-8},
  abstract = {Metrics are essential in object-oriented software engineering for several reasons, among which quality assessment and improvement of development team productivity. While the mathematical nature of metrics calls for clear definitions, frequently there exist many contradicting definitions of the same metric depending on the implementation language. 
We suggest to express and define metrics using a language-independent metamodel based on graphs. This graph-based approach allows for an unambiguous definition of generic object-oriented metrics and higher-order metrics. We also report on some prototype tools that implement these ideas. We thank Kim Mens, Roel Wuyts and the anonymous reviewers for their comments on drafts of this paper.},
  number = {2},
  journal = {Electronic Notes in Theoretical Computer Science},
  author = {Mens, Tom and Lanza, Michele},
  month = nov,
  year = {2002},
  keywords = {graphs, metrics, semantic networks},
  pages = {57--68}
}

@inproceedings{beck_industrial_1996,
  address = {Washington, {DC}, {USA}},
  series = {{ICSE} '96},
  title = {Industrial experience with design patterns},
  isbn = {0-8186-7246-3},
  location = {Berlin, Germany},
  url = {http://portal.acm.org/citation.cfm?id=227726.227747},
  abstract = {A design pattern is a particular prose form of recording design information such that designs which have worked well in the past can be applied again in similar situations in the future. The availability of a collection of design patterns can help both the experienced and the novice designer recognize situations in which design reuse could or should occur. We have found that design patterns: 1) provide an effective "shorthand" for communicating complex concepts effectively between designers, 2) can be used to record and encourage the reuse of "best practices", 3) capture the essential parts of a design in compact form, e.g. for documentation of existing software architectures. Since the patterns community is one that shares information in an open forum and builds on the experiences of others, we chose to submit a joint paper on our industrial experiences with patterns. 
We focus on the lessons learned in our respective industrial settings as a first step towards answering the questions {"Patterns} sound very promising, but how are they actually used in the industry and what benefits, if any, do they bring in practice?" We proceed by briefly describing each of our respective experiences with patterns. This is followed by a joint "lessons learned" section and conclusion.},
  booktitle = {Proceedings of the 18th International Conference on Software Engineering},
  publisher = {{IEEE} Computer Society},
  author = {Beck, Kent and Crocker, Ron and Meszaros, Gerard and Vlissides, John and Coplien, James O. and Dominick, Lutz and Paulisch, Frances},
  year = {1996},
  keywords = {design},
  pages = {103--114},
  annote = {Mentions spaghetti classes}
}

@book{pressman_software_1997,
  address = {New York},
  edition = {Fourth},
  title = {Software Engineering: A Practitioner's Approach},
  isbn = {0070521824},
  lccn = {{QA76.758} {.P75} 1997},
  shorttitle = {Software Engineering},
  publisher = {{McGraw-Hill}},
  author = {Pressman, Roger S.},
  year = {1997},
  keywords = {maintenance}
}

@inproceedings{grechanik_empirical_2010,
  address = {New York, {NY}, {USA}},
  series = {{ESEM} '10},
  title = {An empirical investigation into a large-scale Java open source code repository},
  isbn = {978-1-4503-0039-1},
  location = {{Bolzano-Bozen}, Italy},
  doi = {10.1145/1852786.1852801},
  abstract = {Getting insight into different aspects of source code artifacts is increasingly important -- yet there is little empirical research using large bodies of source code, and subsequently there are not much statistically significant evidence of common patterns and facts of how programmers write source code. We pose 32 research questions, explain rationale behind them, and obtain facts from 2,080 randomly chosen Java applications from Sourceforge. 
Among these facts we find that most methods have one or zero arguments or they do not return any values, few methods are overridden, most inheritance hierarchies have the depth of one, close to 50\% of classes are not explicitly inherited from any classes, and the number of methods is strongly correlated with the number of fields in a class.},
  booktitle = {Proceedings of the 2010 {ACM-IEEE} International Symposium on Empirical Software Engineering and Measurement},
  publisher = {{ACM}},
  author = {Grechanik, Mark and {McMillan}, Collin and {DeFerrari}, Luca and Comi, Marco and Crespi, Stefano and Poshyvanyk, Denys and Fu, Chen and Xie, Qing and Ghezzi, Carlo},
  year = {2010},
  note = {{ACM} {ID:} 1852801},
  keywords = {complexity measures, empirical, metrics, mining software repositories, open source, patterns, software repository},
  pages = {11:1--11:10}
}

@article{kanellopoulos_improved_2007,
  title = {An improved methodology on information distillation by mining program source code},
  volume = {61},
  url = {http://portal.acm.org/citation.cfm?id=1231808},
  abstract = {This paper presents a methodology for knowledge acquisition from source code. We use data mining to support semi-automated software maintenance and comprehension and provide practical insights into systems specifics, assuming one has limited prior familiarity with these systems. We propose a methodology and an associated model for extracting information from object oriented code by applying clustering and association rules mining. K-means clustering produces system overviews and deductions, which support further employment of an improved version of {MMS} Apriori that identifies hidden relationships between classes, methods and member data. The methodology is evaluated on an industrial case study, results are discussed and conclusions are drawn.},
  number = {2},
  journal = {Data \& Knowledge Engineering},
  author = {Kanellopoulos, Y. and Makris, C. 
and Tjortjis, C.},
  year = {2007},
  keywords = {data mining},
  pages = {359--383}
}

@inproceedings{parnin_catalogue_2008,
  address = {Ammersee, Germany},
  title = {A catalogue of lightweight visualizations to support code smell inspection},
  isbn = {978-1-60558-112-5},
  url = {http://portal.acm.org/citation.cfm?id=1409733&dl=GUIDE&coll=GUIDE&CFID=21397908&CFTOKEN=14373822},
  doi = {10.1145/1409720.1409733},
  abstract = {Preserving the integrity of software systems is essential in ensuring future product success. Commonly, companies allocate only a limited budget toward perfective maintenance and instead pressure developers to focus on implementing new features. Traditional techniques, such as code inspection, consume many staff resources and attention from developers. Metrics automate the process of checking for problems but produce voluminous, imprecise, and incongruent results. An opportunity exists for visualization to assist where automated measures have failed; however, current software visualization techniques only handle the voluminous aspect of data but fail to address imprecise and incongruent aspects. In this paper, we describe several techniques for visualizing possible defects reported by automated inspection tools. We propose a catalogue of lightweight visualizations that assist reviewers in weeding out false positives. 
We implemented the visualizations in a tool called {NOSEPRINTS} and present a case study on several commercial systems and open source applications in which we examined the impact of our tool on the inspection process.},
  booktitle = {Proceedings of the 4th {ACM} Symposium on Software Visualization},
  publisher = {{ACM}},
  author = {Parnin, Chris and G{\"o}rg, Carsten and Nnadi, Ogechi},
  year = {2008},
  keywords = {refactoring, smells, visualization},
  pages = {77--86}
}

@article{heer_tour_2010,
  title = {A tour through the visualization zoo},
  volume = {53},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1743567&type=html&coll=GUIDE&dl=GUIDE&CFID=90293502&CFTOKEN=64298228},
  doi = {10.1145/1743546.1743567},
  abstract = {A survey of powerful visualization techniques, from the obvious to the obscure.},
  number = {6},
  journal = {Communications of the {ACM}},
  author = {Heer, Jeffrey and Bostock, Michael and Ogievetsky, Vadim},
  year = {2010},
  keywords = {visualization},
  pages = {59--67}
}

@phdthesis{wills_automated_1992,
  title = {Automated program recognition by graph parsing},
  url = {http://users.ece.gatech.edu/~linda/phd-thesis.html},
  school = {Massachusetts Institute of Technology},
  author = {Wills, Linda Mary},
  year = {1992},
  keywords = {graphs},
  annote = {available at http://users.ece.gatech.edu/{\textasciitilde}linda/phd-thesis.html as scanned {PostScript} chapters. Abstract: The recognition of standard computational structures (cliches) in a program can help an experienced programmer understand the program. Based on the known relationships between the cliches, a hierarchical description of the program's design can be recovered. We develop and study a graph parsing approach to automating program recognition in which programs are represented as attributed dataflow graphs and a library of cliches is encoded as an attributed graph grammar. Graph parsing is used to recognize cliches in the code. 
This approach has been implemented in a system called {GRASPR}, which stands for {``GRAph-based} System for Program Recognition.'' Relevance: 4}
}

@article{chaumun_change_2002,
  title = {A change impact model for changeability assessment in object-oriented software systems},
  volume = {45},
  abstract = {The assessment of the changeability of software systems is of major concern for buyers of the large systems found in fast-moving domains such as telecommunications. One way of approaching this problem is to investigate the dependency between the changeability of the software and its design, with the goal of finding design properties that can be used as changeability indicators. In our research, we defined a model of software changes and change impacts and implemented it for the C++ language. Furthermore, we identified a set of nine object-oriented {(OO)} design metrics, four of which are specifically geared towards changeability detection. The model and the metrics were applied to three test systems of industrial size. The experiment showed a high correlation, across systems and across changes, between changeability and the access to a class by other classes through method invocation or variable access. On the other hand, no result could support the hypothesis that the depth of the inheritance tree has some influence on changeability. Furthermore, our results confirm the observation of others that the use of inheritance is rather limited in industrial systems.},
  number = {2},
  journal = {Science of Computer Programming},
  author = {Chaumun, M. A. and Kabaili, H. and Keller, R. K. and Lustman, F.},
  year = {2002},
  keywords = {maintainability, metrics},
  pages = {155--174}
}

@inproceedings{dietrich_cluster_2008,
  address = {Ammersee, Germany},
  title = {Cluster analysis of Java dependency graphs},
  isbn = {978-1-60558-112-5},
  doi = {10.1145/1409720.1409735},
  abstract = {We present a novel approach to the analysis of dependency graphs of object-oriented programs. 
We propose to use the {Girvan-Newman} clustering algorithm to compute the modular structure of programs. This is useful in assisting software engineers to redraw component boundaries in software, in order to improve the level of reuse and maintainability. The results of this analysis can be used as a starting point for refactoring the software. We present {BARRIO}, an Eclipse plugin that can detect and visualise clusters in dependency graphs extracted from Java programs by means of source code and byte code analysis. These clusters are then compared with the modular structure of the analysed programs defined by package and container specifications. Two metrics are introduced to measure the degree of overlap between the defined and the computed modular structure. Some empirical results obtained from analysing non-trivial software packages are presented.},
  booktitle = {Proceedings of the 4th {ACM} Symposium on Software Visualization},
  publisher = {{ACM}},
  author = {Dietrich, Jens and Yakovlev, Vyacheslav and {McCartin}, Catherine and Jenson, Graham and Duchrow, Manfred},
  year = {2008},
  keywords = {betweenness, clustering, dependency analysis, Eclipse, refactoring, software clustering, subsystem identification},
  pages = {91--94},
  url = {http://portal.acm.org/citation.cfm?doid=1409720.1409735},
  annote = {Mentions social network concepts like betweenness. Makes use of Jung for the {Girvan-Newman} algorithm and Prefuse for visualization. The emphasis is on (re)packaging. Relevance: 4 }
}

@article{kriegel_clustering_2009,
  title = {Clustering high-dimensional data: A survey on subspace clustering, pattern-based clustering, and correlation clustering},
  volume = {3},
  issn = {1556-4681},
  shorttitle = {Clustering high-dimensional data},
  doi = {10.1145/1497577.1497578},
  abstract = {As a prolific research area in data mining, subspace clustering and related problems induced a vast quantity of proposed solutions. 
However, many publications compare a new proposition{\textemdash}if at all{\textemdash}with one or two competitors, or even with a so-called {\textquotedblleft}na\"{i}ve{\textquotedblright} ad hoc solution, but fail to clarify the exact problem definition. As a consequence, even if two solutions are thoroughly compared experimentally, it will often remain unclear whether both solutions tackle the same problem or, if they do, whether they agree in certain tacit assumptions and how such assumptions may influence the outcome of an algorithm. In this survey, we try to clarify: (i) the different problem definitions related to subspace clustering in general; (ii) the specific difficulties encountered in this field of research; (iii) the varying assumptions, heuristics, and intuitions forming the basis of different approaches; and (iv) how several prominent solutions tackle different problems.},
  journal = {{ACM} Transactions on Knowledge Discovery from Data {(TKDD)}},
  author = {Kriegel, {Hans-Peter} and Kr{\"o}ger, Peer and Zimek, Arthur},
  month = mar,
  year = {2009},
  keywords = {algorithms, clustering, data mining, high-dimensional data, subspace clustering, survey, theory},
  pages = {1:1--1:58}
}
% kriegel_clustering_2009 (closed just above): the page range uses the plain "--"
% form, and the umlaut in the second author's surname is written as a BibTeX
% special character so sorting and label generation treat it as one letter.

% _understanding_2006: "Eclipse" is brace-protected so sentence-casing styles
% keep its capital.
@misc{_understanding_2006,
  type = {{CT316}},
  title = {Understanding how {Eclipse} plug-ins work with {OSGi}},
  copyright = {{\textcopyright} Copyright~{IBM} Corporation~2006},
  url = {http://www.ibm.com/developerworks/library/os-ecl-osgi/index.html},
  month = jun,
  year = {2006},
  keywords = {Eclipse},
  howpublished = {http://www.ibm.com/developerworks/library/os-ecl-osgi/index.html}
}

@article{widmer_unleashing_2006,
  title = {Unleashing the Power of Refactoring},
  url = {http://www.eclipse.org/articles/article.php?file=Article-Unleashing-the-Power-of-Refactoring/index.html},
  journal = {Eclipse Magazine},
  author = {Widmer, Tobias},
  month = jul,
  year = {2006},
  keywords = {Eclipse, refactoring},
  annote = {Describes how some of the specific Eclipse refactoring {APIs} are used
within an example. Relevance: 5 }
}

@inproceedings{marcus_conceptual_2005,
  title = {The Conceptual Cohesion of Classes},
  isbn = {0-7695-2368-4},
  url = {http://portal.acm.org/citation.cfm?id=1091849},
  abstract = {While often defined in informal ways, software cohesion reflects important properties of modules in a software system. Cohesion measurement has been used for quality assessment, fault proneness prediction, software modularization, etc. Existing approaches to cohesion measurement in {Object-Oriented} software are largely based on the structural information of the source code, such as attribute references in methods. These measures reflect particular interpretations of cohesion and try to capture different aspects of cohesion and no single cohesion metric or suite is accepted as standard measurement for cohesion. The paper proposes a new set of measures for the cohesion of individual classes within an {OO} software system, based on the analysis of the semantic information embedded in the source code, such as comments and identifiers. A case study on open source software is presented, which compares the new measures with an extensive set of existing metrics.
The differences and similarities among the approaches and results are discussed and analyzed.},
  booktitle = {Proceedings of the 21st {IEEE} International Conference on Software Maintenance},
  publisher = {{IEEE} Computer Society},
  author = {Marcus, Andrian and Poshyvanyk, Denys},
  year = {2005},
  keywords = {cohesion, metrics, semantics},
  pages = {133--142}
}

% dong_architecture_2007: the former doi value (10.1.1.75.3386) is a CiteSeerX
% document id, not a DOI, and would resolve to a dead link; it remains available
% in url. The volume holds only the numeral, since styles add the word "volume".
@inproceedings{dong_architecture_2007,
  address = {Las Vegas, Nevada},
  title = {Architecture and Design Pattern Discovery Techniques {\textendash} A Review},
  volume = {{II}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.75.3386},
  abstract = {Architecture and design patterns, as demonstrated solutions to recurring problems, have proved practically important and useful in the process of software development. They have been extensively applied in industry. Discovering the instances of architecture and design patterns from the source code of software systems can assist the understanding of the systems and the process of re-engineering. More importantly, it also helps to trace back to the original architecture and design decisions, which are typically missing for legacy systems. This paper presents a review on current techniques and tools for discovering architecture and design patterns from object-oriented systems. We classify different approaches and analyze their results. We also discuss the disparity of the discovery results from different approaches and analyze possible reasons with some insight.},
  booktitle = {Proceedings of the 2007 International Conference on Software Engineering Research \& Practice, {SERP} 2007},
  author = {Dong, Jing and Zhao, Yajing and Peng, Tu},
  month = jun,
  year = {2007},
  keywords = {design patterns, reengineering, survey},
  pages = {621--627},
  annote = {Has several tables comparing approaches and results. Many of the tools surveyed seem to involve Prolog-like reasoning.
Talks about false positives and negatives and the reasons for them, including difficulties in discriminating between concrete classes and abstract ones, determining whether delegation actually occurs, etc. The paper goes into surprising detail in some of its analyses, mentioning specific classes and relationships on some of the packages being analyzed. Relevance: 5}
}

% patrikainen_comparing_2006: doi holds the bare identifier; the resolver prefix
% is supplied by the bibliography style.
@article{patrikainen_comparing_2006,
  title = {Comparing Subspace Clusterings},
  volume = {18},
  issn = {1041-4347},
  doi = {10.1109/TKDE.2006.106},
  abstract = {We present the first framework for comparing subspace clusterings. We propose several distance measures for subspace clusterings, including generalizations of well-known distance measures for ordinary clusterings. We describe a set of important properties for any measure for comparing subspace clusterings and give a systematic comparison of our proposed measures in terms of these properties. We validate the usefulness of our subspace clustering distance measures by comparing clusterings produced by the algorithms {FastDOC}, {HARP}, {PROCLUS}, {ORCLUS}, and {SSPC.} We show that our distance measures can be also used to compare partial clusterings, overlapping clusterings, and patterns in binary data matrices.},
  number = {7},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  author = {Patrikainen, Anne and Meila, Marina},
  year = {2006},
  keywords = {clustering, distance, subspace clustering},
  pages = {902--916}
}

@inproceedings{briand_unified_1997,
  title = {A unified framework for cohesion measurement in object-oriented systems},
  abstract = {The increasing importance being placed on software measurement has led to an increased amount of research in developing new software measures. Given the importance of object oriented development techniques, one specific area where this has occurred is cohesion measurement in object oriented systems.
However despite an interesting body of work, there is little understanding of the motivations and empirical hypotheses behind many of these new measures. It is often difficult to determine how such measures relate to one another and for which application they can be used. As a consequence, it is very difficult for practitioners and researchers to obtain a clear picture of the state of the art in order to select or define cohesion measures for object oriented systems. To help remedy this situation a unified framework, based on the issues discovered in a review of object oriented cohesion measures, is presented. The unified framework contributes to an increased understanding of the state of the art as it is a mechanism for: (i) comparing measures and their potential use, (ii) integrating existing measures which examine the same concepts in different ways, and (iii) facilitating more rigorous decision making regarding the definition of new measures and the selection of existing measures for a specific goal of measurement},
  booktitle = {Proceedings of the Fourth International Software Metrics Symposium},
  author = {Briand, L. C. and Daly, J. W. and Wust, J.},
  year = {1997},
  keywords = {cohesion, metrics, survey},
  pages = {43--53},
  annote = {Includes some classifications of different kinds of cohesion - temporal, procedural, sequential, ... Also mentions issues distinguishing various metrics - direct vs. indirect attribute access, graph connectivity, whether inheritance is considered. There is also mention of how constructors and accessors can perturb the various metrics. Relevance: 4}
}
% briand_unified_1997 (closed just above): author initials are separated by
% spaces so each one is parsed as its own given-name token.

% gessel_maximal_2009: the umlaut in the publisher name is written as a BibTeX
% special character.
@incollection{gessel_maximal_2009,
  address = {Boston, {MA}},
  title = {Maximal Flow Through a Network},
  isbn = {978-0-8176-4841-1},
  url = {http://www.springerlink.com/content/n18653l383436x61/},
  booktitle = {Classic Papers in Combinatorics},
  publisher = {Birkh{\"a}user Boston},
  author = {Ford, L. R. and Fulkerson, D.
R.},
  editor = {Gessel, Ira and Rota, {Gian-Carlo}},
  year = {2009},
  keywords = {clustering, graphs},
  pages = {243--248}
}

@book{mens_software_2008,
  address = {New York, {NY}, {USA}},
  title = {Software Evolution},
  publisher = {{Springer-Verlag}},
  editor = {Mens, Tom and Demeyer, Serge},
  month = mar,
  year = {2008},
  keywords = {clone detection, clones, design patterns, maintenance},
  annote = {Only two chapters are particularly relevant - {"Identifying} and Removing Software Clones" and {"Object-Oriented} Reengineering." There are better sources on these topics elsewhere. Relevance: 3}
}

% The author below is an institution: the extra brace pair makes BibTeX treat the
% whole name as one indivisible surname instead of splitting it into given names,
% a "von" part ("for") and a last name.
@misc{center_for_history_and_new_media_zotero_????,
  title = {Zotero Quick Start Guide},
  url = {http://zotero.org/support/quick_start_guide},
  author = {{Center for History and New Media}},
  howpublished = {http://zotero.org/support/quick\_start\_guide},
  annote = {Welcome to Zotero! View the Quick Start Guide to learn how to begin collecting, managing, citing, and sharing your research sources. Thanks for installing Zotero.}
}

@inproceedings{lungu_exploring_2007,
  title = {Exploring {Inter-Module} Relationships in Evolving Software Systems},
  isbn = {0-7695-2802-3},
  url = {http://portal.acm.org/citation.cfm?id=1252777},
  abstract = {Many of the existing approaches to reverse architecting "the reverse engineering of the architecture of software systems" are based on software exploration tools which provide interactive ways of exploring the system. These tools start with high-level views of the system and refine them with drill-down techniques applied on the high-level entities such as modules and packages, leaving aside valuable information contained in the dependencies between them. In this article we argue that the visualization of inter-module relations bears great potential for supporting the understanding of large evolving software systems. We present two concrete examples of such visualizations.
The first, The Semantic Dependency Matrix is a technique for displaying details about a dependency between two modules which groups together classes with similar behavior. The second, The Edge Evolution Filmstrip presents the evolution of an inter-module relation through multiple versions of the system. Based on our experience with the Edge Evolution Film Strip, we propose a pattern language for inter-module relationships. We exemplify both the visualizations and the pattern language with examples from two large open source software systems.},
  booktitle = {Proceedings of the 11th European Conference on Software Maintenance and Reengineering},
  publisher = {{IEEE} Computer Society},
  author = {Lungu, Mircea and Lanza, Michele},
  year = {2007},
  keywords = {reverse engineering, visualization},
  pages = {91--102}
}

@inproceedings{gui_coupling_2006,
  address = {Shanghai, China},
  series = {{MSR} '06},
  title = {Coupling and cohesion measures for evaluation of component reusability},
  isbn = {1-59593-397-2},
  doi = {10.1145/1137983.1137989},
  abstract = {This paper provides an account of new measures of coupling and cohesion developed to assess the reusability of Java components retrieved from the internet by a search engine. These measures differ from the majority of established metrics in two respects: they reflect the degree to which entities are coupled or resemble each other, and they take account of indirect couplings or similarities. An empirical comparison of the new measures with eight established metrics shows the new measures are consistently superior at ranking components according to their reusability.},
  booktitle = {Proceedings of the 2006 international workshop on Mining software repositories},
  publisher = {{ACM}},
  author = {Gui, G. and Scott, P.
D.},
  year = {2006},
  note = {{ACM} {ID:} 1137989},
  keywords = {Cohesion, complexity measures, Coupling, experimentation, measurement, reusability},
  pages = {18--21}
}
% gui_coupling_2006 (closed just above): the final author initial carries its
% period, and the page range uses the plain "--" form.

% zhang_birch:_1996: two places in the abstract where the export fused the end of
% one sentence with the start of the next ("costs.This", "effectively.We") are
% separated again.
@inproceedings{zhang_birch:_1996,
  address = {New York, {NY}, {USA}},
  title = {{BIRCH:} an efficient data clustering method for very large databases},
  volume = {25},
  shorttitle = {{BIRCH}},
  doi = {10.1145/235968.233324},
  abstract = {Finding useful patterns in large datasets has attracted considerable interest recently, and one of the most widely studied problems in this area is the identification of clusters, or densely populated regions, in a multi-dimensional dataset. Prior work does not adequately address the problem of large datasets and minimization of {I/O} costs. This paper presents a data clustering method named {BIRCH} {(Balanced} Iterative Reducing and Clustering using Hierarchies), and demonstrates that it is especially suitable for very large databases. {BIRCH} incrementally and dynamically clusters incoming multi-dimensional metric data points to try to produce the best quality clustering with the available resources (i.e., available memory and time constraints). {BIRCH} can typically find a good clustering with a single scan of the data, and improve the quality further with a few additional scans. {BIRCH} is also the first clustering algorithm proposed in the database area to handle "noise" (data points that are not part of the underlying pattern) effectively. We evaluate {BIRCH's} time/space efficiency, data input order sensitivity, and clustering quality through several experiments.
We also present a performance comparisons of {BIRCH} versus {CLARANS}, a clustering method proposed recently for large datasets, and show that {BIRCH} is consistently superior.},
  booktitle = {{ACM} {SIGMOD} Record},
  publisher = {{ACM}},
  author = {Zhang, Tian and Ramakrishnan, Raghu and Livny, Miron},
  month = jun,
  year = {1996},
  note = {{ACM} {ID:} 233324},
  keywords = {algorithms, clustering, measurement, performance, theory},
  pages = {103--114}
}
% zhang_birch:_1996 (closed just above): the page range uses the plain "--" form
% rather than a hard-coded dash macro.

@book{duda_pattern_2000,
  title = {Pattern Classification (2nd Edition)},
  isbn = {0471056693},
  url = {http://portal.acm.org/citation.cfm?id=954544&dl=GUIDE&coll=GUIDE&CFID=13704865&CFTOKEN=30535523},
  publisher = {{Wiley-Interscience}},
  author = {Duda, Richard O. and Hart, Peter E. and Stork, David G.},
  year = {2000},
  keywords = {pattern matching, similarity},
  annote = {Great book.}
}

@misc{_zotero_????,
  title = {Zotero - Quick Start Guide},
  url = {http://www.zotero.org/documentation/quick_start_guide}
}

@article{bar-yossef_cluster_2008,
  title = {Cluster ranking with an application to mining mailbox networks},
  volume = {14},
  url = {http://dx.doi.org/10.1007/s10115-007-0096-0},
  doi = {10.1007/s10115-007-0096-0},
  abstract = {We initiate the study of a new clustering framework, called cluster ranking. Rather than simply partitioning a network into clusters, a cluster ranking algorithm also orders the clusters by their strength. To this end, we introduce a novel strength measure for clusters{\textemdash}the integrated cohesion{\textemdash}which is applicable to arbitrary weighted networks. We then present a new cluster ranking algorithm, called {C-Rank.} We provide extensive theoretical and empirical analysis of {C-Rank} and show that it is likely to have high precision and recall. A main component of {C-Rank} is a heuristic algorithm for finding sparse vertex separators. At the core of this algorithm is a new connection between vertex betweenness and multicommodity flow.
Our experiments focus on mining mailbox networks. A mailbox network is an egocentric social network, consisting of contacts with whom an individual exchanges email. Edges between contacts represent the frequency of their co{\textendash}occurrence on message headers. {C-Rank} is well suited to mine such networks, since they are abundant with overlapping communities of highly variable strengths. We demonstrate the effectiveness of {C-Rank} on the Enron data set, consisting of 130 mailbox networks.},
  number = {1},
  journal = {Knowledge and Information Systems},
  author = {{Bar-Yossef}, Ziv and Guy, Ido and Lempel, Ronny and Maarek, Yo{\"e}lle and Soroka, Vladimir},
  month = jan,
  year = {2008},
  keywords = {clustering, graph algorithms, graphs},
  pages = {101--139}
}
% bar-yossef_cluster_2008 (closed just above): the diaeresis in the fourth
% author's given name is written as a BibTeX special character.

% caserta_visualization_2010: doi holds the bare identifier; the resolver prefix
% is supplied by the bibliography style.
@article{caserta_visualization_2010,
  title = {Visualization of the Static Aspects of Software: A Survey},
  volume = {99},
  issn = {1077-2626},
  shorttitle = {Visualization of the Static Aspects of Software},
  doi = {10.1109/TVCG.2010.110},
  abstract = {Software is usually complex and always intangible. In practice, the development and maintenance processes are time-consuming activities mainly because software complexity is difficult to manage. Graphical visualization of software has the potential to result in a better and faster understanding of its design and functionality, saving time and providing valuable information to improve its quality. However, visualizing software is not an easy task because of the huge amount of information comprised in the software. Furthermore, the information content increases significantly once the time dimension to visualize the evolution of the software is taken into account. Human perception of information and cognitive factors must thus be taken into account to improve the understandability of the visualization.
In this paper, we survey visualization techniques, both {2D-} and {3D-based}, representing the static aspects of the software and its evolution. We categorize these techniques according to the issues they focus on, in order to help compare them and identify the most relevant techniques and tools for a given problem.},
  number = {1},
  journal = {{IEEE} Transactions on Visualization and Computer Graphics},
  author = {Caserta, Pierre and Zendra, Olivier},
  month = aug,
  year = {2010},
  keywords = {case, graphical environments, metrics, survey, visualization}
}

% miller_magical_1956: journal carries only the journal name; the "Vol 63(2)"
% that was appended to it duplicated the volume and number fields and would be
% printed twice. The author's middle initial has its period.
@article{miller_magical_1956,
  title = {The magical number seven, plus or minus two: some limits on our capacity for processing information.},
  volume = {63},
  issn = {{0033-295X}},
  shorttitle = {The magical number seven, plus or minus two},
  abstract = {A variety of researches are examined from the standpoint of information theory. It is shown that the unaided observer is severely limited in terms of the amount of information he can receive, process, and remember. However, it is shown that by the use of various techniques, e.g., use of several stimulus dimensions, recoding, and various mnemonic devices, this informational bottleneck can be broken. 20 references. {(PsycINFO} Database Record (c) 2010 {APA}, all rights reserved)},
  number = {2},
  journal = {Psychological Review},
  author = {Miller, George A.},
  month = mar,
  year = {1956},
  keywords = {information theory},
  pages = {81--97}
}

@inproceedings{wettel_visually_2008,
  address = {Ammersee, Germany},
  title = {Visually localizing design problems with disharmony maps},
  isbn = {978-1-60558-112-5},
  url = {http://portal.acm.org/citation.cfm?id=1409720.1409745},
  doi = {10.1145/1409720.1409745},
  abstract = {Assessing the quality of software design is difficult, as "design" is expressed through guidelines and heuristics, not rigorous rules.
One successful approach to assess design quality is based on detection strategies, which are metrics-based composed logical conditions, by which design fragments with specific properties are detected in the source code. Such detection strategies, when executed on large software systems usually return large sets of artifacts, which potentially exhibit one or more "design disharmonies", which are then inspected manually, a cumbersome activity.},
  booktitle = {Proceedings of the 4th {ACM} Symposium on Software Visualization},
  publisher = {{ACM}},
  author = {Wettel, Richard and Lanza, Michele},
  year = {2008},
  keywords = {smells, visualization},
  pages = {155--164}
}

@article{badri_revisiting_2008,
  title = {Revisiting Class Cohesion: an empirical investigation on several systems},
  volume = {7},
  shorttitle = {Revisiting Class Cohesion},
  abstract = {Class cohesion is considered as one of most important object-oriented software attributes. Cohesion refers to the degree of relatedness between members in a class. High cohesion is a desirable property of classes. Several metrics have been proposed in literature in order to measure class cohesion in object-oriented systems. They capture class cohesion in terms of connections between members within a class. Most of these metrics have been experimented and widely discussed. They do not take into account some characteristics of classes as stated in several papers. We present, in this paper, an extention of the cohesion metric we proposed in a previous work. We introduce a new cohesion criterion based on common objects parameters. Our main goal in this work was: (1) to demonstrate, by analyzing many real systems that the introduced criterion is statistically significant and, (2) to validate our approach for class cohesion assessment by exploring empirically the relationship that may exist between our new cohesion metric and coupling.
We developed a cohesion measurement tool for Java programs and performed an empirical study on several systems. The selected test systems vary in size and domain. The obtained results demonstrate that: (1) the new class cohesion metric captures several additional pairs of related methods and (2) there exists a significant correlation between the new cohesion metric and coupling.},
  number = {6},
  journal = {Journal of Object Technology},
  author = {Badri, Linda and Badri, Mourad and Gueye, Alioune},
  month = aug,
  year = {2008},
  keywords = {cohesion, empirical, metrics},
  annote = {Just measurements on their own metrics. Relevance: 1}
}
% badri_revisiting_2008 (closed just above): the journal name is stored with its
% conventional casing ("of", not "Of"), since styles print this field verbatim.

@article{turney_frequency_2010,
  title = {From frequency to meaning: Vector space models of semantics},
  volume = {37},
  issn = {1076-9757},
  abstract = {Computers understand very little of the meaning of human language. This profoundly limits our ability to give instructions to computers, the ability of computers to explain their actions to us, and the ability of computers to analyse and process text. Vector space models {(VSMs)} of semantics are beginning to address these limits. This paper surveys the use of {VSMs} for semantic processing of text. We organize the literature on {VSMs} according to the structure of the matrix in a {VSM.} There are currently three broad classes of {VSMs}, based on term{\textendash}document, word{\textendash}context, and pair{\textendash}pattern matrices, yielding three classes of applications. We survey a broad range of applications in these three categories and we take a detailed look at a specific open source project in each category.
Our goal in this survey is to show the breadth of applications of {VSMs} for semantics, to provide a new perspective on {VSMs} for those who are already familiar with the area, and to provide pointers into the literature for those who are less familiar with the field.},
  number = {1},
  journal = {Journal of Artificial Intelligence Research},
  author = {Turney, Peter and Pantel, Patrick},
  year = {2010},
  keywords = {semantics},
  pages = {141--188}
}
% turney_frequency_2010 (closed just above): the page range uses the plain "--"
% form rather than a hard-coded dash macro.

@article{poels_distance-based_2000,
  title = {Distance-based software measurement: necessary and sufficient properties for software measures},
  volume = {42},
  issn = {0950-5849},
  shorttitle = {Distance-based software measurement},
  url = {http://www.sciencedirect.com/science/article/B6V0B-3Y21WCF-4/2/d556ae5183ca304cc5ebf5b8d7f49ffb},
  doi = {10.1016/S0950-5849(99)00053-1},
  abstract = {Axiomatic approaches to software measurement present sets of necessary, but not sufficient measure axioms. The insufficiency of the measure axioms implies that they are useful to invalidate existing software measures, but not to validate them. In this paper, a set of measure axioms is presented whose sufficiency is guaranteed by measurement theory. The axioms referred to are the metric axioms, used in mathematics to define measures of distance. We present a constructive procedure that defines software measures satisfying these axioms. As an illustration of distance-based software measurement, a measure is defined for the aggregation coupling of object classes.},
  number = {1},
  journal = {Information and Software Technology},
  author = {Poels, G.
and Dedene, G.},
  month = jan,
  year = {2000},
  keywords = {measurement theory, metrics},
  pages = {35--46}
}

@book{borgida_principles_1991,
  title = {Principles of semantic networks: explorations in the representation of knowledge},
  isbn = {9781558600881},
  shorttitle = {Principles of semantic networks},
  publisher = {Morgan Kaufmann},
  author = {Borgida, Alexander and Sowa, John F.},
  year = {1991},
  keywords = {Artificial intelligence, semantic networks, Semantics}
}

% akers_reengineering_2005: doi holds the bare identifier (no resolver prefix),
% and the page range uses "--" where the export had an em-dash macro.
@inproceedings{akers_reengineering_2005,
  address = {Washington, {DC}, {USA}},
  title = {Reengineering C++ Component Models via Automatic Program Transformation},
  isbn = {0-7695-2474-5},
  doi = {10.1109/WCRE.2005.25},
  booktitle = {{WCRE} '05: Proceedings of the 12th Working Conference on Reverse Engineering},
  publisher = {{IEEE} Computer Society},
  author = {Akers, Robert L. and Baxter, Ira D. and Mehlich, Michael and Ellis, Brian J. and Luecke, Kenn R.},
  year = {2005},
  keywords = {clones, refactoring},
  pages = {13--22},
  annote = {{AST-to-AST} rewriting engine, ability to specify and apply source-to-source program transformations based on language syntax, ties rules to grammar elements {DMS} has been under development for nine years. As presently constituted, it has been used for a variety of large scale commercial activities, including crossplatform migrations, domain-specific code generation, and construction of a variety of conventional software engineering tools implementing tasks such as dead/clone code elimination, test code coverage, execution profiling, source code browsing, and static metrics analysis.
Relevance: 5}
}

% garcia_assessment_2007: the DOI embedded in url is also stored as a bare doi
% field; the journal is given by its full name instead of an abbreviation; pages
% were exported as "37, 31", which reads as a reversed range.
% NOTE(review): 31--37 is believed to match the publisher record -- confirm.
@article{garcia_assessment_2007,
  title = {Assessment of Contemporary Modularization Techniques - {ACoM'07:} workshop report},
  volume = {32},
  issn = {0163-5948},
  shorttitle = {Assessment of Contemporary Modularization Techniques - {ACoM'07}},
  url = {http://dx.doi.org/10.1145/1290993.1291005},
  doi = {10.1145/1290993.1291005},
  abstract = {The effective assessment of emerging modularization technologies plays a pivotal role on: (i) a better understanding of their real benefits and drawbacks when compared to conventional development techniques, and (ii) their effective transfer to mainstream software development. This report is intended to summarize the results of the 1st International Workshop on Assessment of Contemporary Modularization Techniques {(ACoM{\textquoteright}07)} held in Minneapolis, {USA}, May 22, 2007, as part of the 29th International Conference on Software Engineering {(ICSE{\textquoteright}07).} The main purpose of this workshop was to share and pool the collective experience of people interested in and actively working on assessment of innovative modularization techniques. The workshop consisted of an opening presentation, several paper presentations organized into three technical sessions, and four discussion groups. During the workshop presentations and discussions, the authors and participants directly and indirectly reviewed ongoing and previous work and debated a number of important issues on contemporary modularity assessment. The {ACoM{\textquoteright}07} website, including the electronic version of this report, can be found at {{\textless}www.comp.lancs.ac.uk/computing/ACoM.07/{\textgreater}.} We begin by presenting an overview of our goals and the workshop structure, and then focus on the workshop technical program and results.},
  number = {5},
  journal = {{ACM} {SIGSOFT} Software Engineering Notes},
  author = {Garcia, Alessandro and Greenwood, Phil and Heineman, George and Walker, Robert and Cai, Yuanfang and Yang, Hong and Baniassad, Elisa and Lopes, Cristina and Schwanninger, Christa and Zhao, Jianjun},
  year = {2007},
  keywords = {aspects, metrics},
  pages = {31--37},
  annote = {Discusses the happenings at a modularization workshop. The pointers to researchers and issues is somewhat useful, but there isn't too much content here. Relevance: 2}
}

@article{tsang_glimpse_1999,
  title = {A Glimpse of Constraint Satisfaction},
  volume = {13},
  url = {http://portal.acm.org/citation.cfm?id=317257},
  abstract = {Constraint satisfaction has become an important field in computer science. This technology is embedded in millions of pounds of software used by major companies. Many researchers or software engineers in the industry could have benefited from using constraint technology without realizing it. The aim of this paper is to promote constraint technology by providing readers with a fairly quick introduction to this field. The approach here is to use the well known 8-queens problem to illustrate the basic techniques in constraint satisfaction (without going into great details), and leave interested readers with pointers to further study this field.},
  number = {3},
  journal = {Artificial Intelligence Review},
  author = {Tsang, Edward},
  year = {1999},
  keywords = {constraint satisfaction, search},
  pages = {215--227},
  annote = {This is an extremely clear, well-written introductory paper. It talks about constraint programming techniques, including search and repair strategies, lookahead and back-jumping. Relevance: 3}
}

@inproceedings{lee_measuring_1995,
  address = {Maribor, Slovenia},
  title = {Measuring the coupling and cohesion of an object-oriented program based on information flow},
  booktitle = {Proc.
International Conference of Software Quality},
  author = {Lee, {Y.-S.} and Liang, {B.-S.} and Wu, {S.-F.} and Wang, {F.-J.}},
  year = {1995}
}

@book{pfleeger_software_2009,
  title = {Software Engineering: Theory and Practice},
  isbn = {9780136061694},
  shorttitle = {Software Engineering},
  publisher = {Prentice Hall},
  author = {Pfleeger, Shari Lawrence and Atlee, Joanne M.},
  month = feb,
  year = {2009}
}

@book{rich_programmers_1990,
  title = {Programmer's Apprentice},
  isbn = {0201524252},
  lccn = {{QA76.76} D47 R498 P},
  publisher = {Association for Computing Machinery},
  author = {Rich, Charles and Waters, Richard C.},
  month = mar,
  year = {1990},
  keywords = {kbs},
  annote = {Includes a chapter on {"Automated} Cliche Recognition", which describes their techniques for converting code into flow graphs and comparing the graphs. This chapter also talks about learning new cliches. Relevance: 3 }
}

% misra_changing_2006: the umlaut in the second author's surname is written as a
% BibTeX special character so sorting and label generation treat it as one letter.
@incollection{misra_changing_2006,
  address = {Berlin, Heidelberg},
  title = {Changing Programs Correctly: Refactoring with Specifications},
  volume = {4085},
  isbn = {978-3-540-37215-8, 978-3-540-37216-5},
  shorttitle = {Changing Programs Correctly},
  url = {http://www.springerlink.com/content/3x18gk1047g04652/},
  booktitle = {{FM} 2006: Formal Methods},
  publisher = {Springer Berlin Heidelberg},
  author = {Bannwart, Fabian and M{\"u}ller, Peter},
  editor = {Misra, Jayadev and Nipkow, Tobias and Sekerinski, Emil},
  year = {2006},
  keywords = {Formal methods, refactoring},
  pages = {492--507}
}

@article{al_dallal_measuring_2010,
  title = {Measuring the discriminative power of object-oriented class cohesion metrics},
  volume = {{PP}},
  issn = {0098-5589},
  doi = {10.1109/TSE.2010.97},
  abstract = {Several object-oriented cohesion metrics have been proposed in the literature. These metrics aim to measure the relationship between class members, namely methods and attributes.
Different metrics use different models to represent the connectivity pattern of cohesive interactions {(CPCI)} between class members. Most of these metrics are normalized to allow for easy comparison of the cohesion of different classes. However, in some cases, these metrics obtain the same cohesion values for different classes that have the same number of methods and attributes but different {CPCIs.} This leads to incorrectly considering the classes to be the same in terms of cohesion, even though their {CPCIs} clearly indicate that the degrees of cohesion are different. We refer to this as a lack of discrimination anomaly {(LDA)} problem. In this paper, we list and discuss cases in which the {LDA} problem exists, as expressed through the use of fifteen cohesion metrics. In addition, we empirically study the frequent occurrence of the {LDA} problem when the considered metrics are applied to classes in five open source Java systems. Finally, we propose a metric and a simulation-based methodology to measure the discriminative power of cohesion metrics.},
  number = {99},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Al Dallal, J.},
  month = nov,
  year = {2010},
  keywords = {Object oriented modeling, object-oriented programming, Phase measurement, Power measurement, Sensitivity, Software, Software measurement, Software {Quality/SQA}, Software science},
  pages = {1--1}
}

@misc{_jhotdraw_2007,
  title = {{JHotDraw} Start Page},
  url = {http://www.jhotdraw.org/},
  year = {2007},
  note = {Accessed 2011-10-03},
  howpublished = {http://www.jhotdraw.org/}
}

@inproceedings{carey_recovering_2007,
  title = {Recovering Concepts from Source Code with Automated Concept Identification},
  issn = {1063-6897},
  doi = {10.1109/ICPC.2007.31},
  abstract = {The complexity of the systems that software engineers build has continuously grown since the inception of the field.
What has not changed is the engineers' mental capacity to operate on about seven distinct pieces of information at a time. Improvements like the widespread use of {UML} have led to more abstract software design activities, however the same cannot be said for reverse engineering activities. The well known concept assignment problem is still being solved at the line-by-line level of analyzing source code. The introduction of abstraction to the problem will allow the engineer to move farther away from the details of the system, increasing his ability to see the role that domain level concepts play in the system. In this paper we present a technique that facilitates filtering of classes from existing systems at the source level based on their relationship to the core concepts in the domain. This approach can simplify the process of reverse engineering and design recovery, as well as other activities that require a mapping to domain level concepts.},
  booktitle = {15th {IEEE} International Conference on Program Comprehension ({ICPC} '07)},
  author = {Carey, M. M. and Gannod, G. C.},
  year = {2007},
  keywords = {automated concept identification, clustering, metrics, reverse engineering, semantic networks, {UML}},
  pages = {27--36}
}

@article{fruchterman_graph_1991,
  title = {Graph drawing by force-directed placement},
  volume = {21},
  issn = {1097-024X},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/spe.4380211102/abstract},
  doi = {10.1002/spe.4380211102},
  abstract = {We present a modification of the spring-embedder model of Eades {[Congressus} Numerantium, 42, 149{\textendash}160, (1984)] for drawing undirected graphs with straight edges. Our heuristic strives for uniform edge lengths, and we develop it in analogy to forces in natural systems, for a simple, elegant, conceptually-intuitive, and efficient algorithm.},
  number = {11},
  journal = {Software: Practice and Experience},
  author = {Fruchterman, Thomas M.
J. and Reingold, Edward M.},
  month = nov,
  year = {1991},
  keywords = {Force-directed placement, Graph drawing, Multi-level techniques, Simulated annealing},
  pages = {1129--1164}
}

@techreport{czibula_study_2007,
  address = {{Cluj-Napoca}, Romania},
  type = {Technical Report},
  title = {A Study on Clustering Based Restructuring of {Object-Oriented} Software Systems},
  abstract = {The structure of a software system has a major impact on its maintainability. Refactoring is an activity performed through the entire lifecycle of a software system in order to keep the software structure clean and easy to maintain. We have previously introduced in [3] a clustering approach for identifying refactorings in order to improve the structure of software systems. The aim of this paper is to make a comparative analysis on several clustering algorithms (developed based on the approach from [3]) which can be used in order to recondition the class structure of a software system. Based on this analysis, we highlight the advantages of determining refactorings of object-oriented software systems using clustering.},
  number = {Informatica, Volume {LII}, Number 2},
  institution = {Studia Univ. {Babes-Bolyai}},
  author = {Czibula, Istv{\'a}n-Gergely and Serban, Gabriela},
  year = {2007},
  keywords = {clustering, refactoring, software clustering}
}

@inproceedings{czibula_clustering_2008,
  title = {Clustering Based Automatic Refactorings Identification},
  doi = {10.1109/SYNASC.2008.17},
  abstract = {The aim of this paper is to approach the problem of improving the design of an object oriented software system, by identifying the appropriate refactorings. It is well known that improving the quality of software systems design is an important issue during the evolution of object oriented software systems. This improvement can be achieved by refactoring the software system in order to improve its internal structure, but without altering the external behavior of the code.
In this paper we introduce a hierarchical divisive clustering algorithm for automatic identification of refactorings that improve the internal structure of a software system. We evaluate our approach using {JHotDraw} case study and a real software system, emphasizing its advantages in comparison with existing similar approaches.},
  booktitle = {10th International Symposium on Symbolic and Numeric Algorithms for Scientific Computing ({SYNASC} '08)},
  author = {Czibula, I. G. and Czibula, G.},
  year = {2008},
  keywords = {clustering, refactoring, software clustering},
  pages = {253--256}
}

@misc{cassell_ext-c_2010,
  title = {ext-c - Project Hosting on Google Code},
  url = {http://code.google.com/p/ext-c/},
  author = {Cassell, Keith},
  year = {2010},
  howpublished = {http://code.google.com/p/ext-c/}
}

@book{yourdon_structured_1979,
  title = {Structured Design: Fundamentals of a Discipline of Computer Program and Systems Design},
  isbn = {0138544719},
  shorttitle = {Structured Design},
  url = {http://portal.acm.org/citation.cfm?id=578522},
  abstract = {From the Publisher: A valuable new approach to computer systems and program design, structured design is quickly becoming the standard industrial technique for significantly improving productivity, enhancing reliability, and lowering maintenance costs.},
  publisher = {{Prentice-Hall}, Inc.},
  author = {Yourdon, Edward and Constantine, Larry L.},
  year = {1979}
}

@article{gower_minimum_1969,
  title = {Minimum Spanning Trees and Single Linkage Cluster Analysis},
  volume = {18},
  issn = {0035-9254},
  url = {http://www.jstor.org/stable/2346439},
  doi = {10.2307/2346439},
  abstract = {Minimum spanning trees {(MST)} and single linkage cluster analysis {(SLCA)} are explained and it is shown that all the information required for the {SLCA} of a set of points is contained in their {MST.} Known algorithms for finding the {MST} are discussed.
They are efficient even when there are very many points; this makes a {SLCA} practicable when other methods of cluster analysis are not. The relevant computing procedures are published in the Algorithm section of the same issue of Applied Statistics. The use of the {MST} in the interpretation of vector diagrams arising in multivariate analysis is illustrated by an example.},
  number = {1},
  journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
  author = {Gower, J. C. and Ross, G. J. S.},
  month = jan,
  year = {1969},
  keywords = {clustering, graphs},
  pages = {54--64}
}

@book{rumbaugh_omt_1996,
  title = {{OMT} insights},
  isbn = {0138469652, 9780138469658},
  publisher = {Cambridge University Press},
  author = {Rumbaugh, James},
  year = {1996},
  keywords = {{OOD}}
}

@article{canfora_achievements_2011,
  title = {Achievements and challenges in software reverse engineering},
  volume = {54},
  number = {4},
  issn = {0001-0782},
  abstract = {Deeply understanding the intricacies of software must always come before any considerations for modifying it.},
  journal = {Communications of the {ACM}},
  author = {Canfora, Gerardo and Di Penta, Massimiliano and Cerulo, Luigi},
  month = apr,
  year = {2011},
  note = {{ACM} ID: 1924451},
  keywords = {management, software management},
  pages = {142--151}
}

@misc{vijaya_text_2010,
  title = {A Text mining approach for Measuring Conceptual Cohesion of Classes in Object Oriented Systems},
  url = {http://www.scribd.com/doc/29324585/A-text-mining-approach-for-conceptual-cohesion-of-classes-in-object-oriented-systems},
  abstract = {In Object Oriented Software systems cohesion reflects important property as it impacts understanding, reuse and maintenance. This cohesion measurement has been used for quality assessment, fault proneness prediction, software modularization, etc.
Currently proposed measures for cohesion in {OO} software reflect particular interpretations of cohesion and capture different aspects of cohesion and no single cohesion metric or suite is accepted as standard measurement for cohesion. Existing approaches are largely based on using the structural information from the source code, such as attribute references, in methods to measure cohesion. This paper proposes a new set of measures for the cohesion of individual classes within an {OO} software system, based on the analysis of the unstructured information embedded in the source code, such as comments and identifiers. This proposed metric uses the text mining approach namely latent semantic indexing technique which measures the semantic similarity between any two texts in an automatic unsupervised way.},
  author = {Vijaya, N. and Sumathi, R.},
  month = apr,
  year = {2010}
}

@misc{_jena_2010,
  title = {Jena Semantic Web Framework},
  url = {http://jena.sourceforge.net/},
  year = {2010},
  note = {Accessed 2011-10-03},
  howpublished = {http://jena.sourceforge.net/}
}

@phdthesis{yang_measuring_2010,
  type = {{PhD} Thesis},
  title = {Measuring Indirect Coupling},
  abstract = {There is an increasing awareness on the importance of software measurement within the software engineering community, as well as the necessity of respecting the scientific basis of measurement. However there is little evidence for the latter as there is a tendency for researchers and practitioners to apply software metrics without a full awareness of what they mean. Coupling, which is the measure of the interdependence between parts of a software system (e.g. classes), is one important property for which many metrics have been defined. While it is widely agreed that there is a relationship between high coupling and poor maintainability, I argue that current empirical evidence toward this is insufficient to promote a full understanding of this relationship.
Part of this is due to the lack of coverage of all forms of connections that comprise coupling. To illustrate this I identify a specific, indirect, form of coupling that manifests between two seemingly unrelated parts of the system through hidden connections. My thesis is that there is a relationship between indirect coupling and maintainability. To gather evidence for this I follow a methodology based on the philosophies of key software metrics researchers. This involves operationally defining indirect coupling so that it can be accurately measured, establishing an explanatory model as to the relationship between indirect coupling and maintainability, and finally empirically corroborating this model.},
  school = {The University of Auckland},
  address = {New Zealand},
  author = {Yang, Hong Yul},
  year = {2010},
  keywords = {coupling, measurement theory, metrics, metrics validation, thesis},
  annote = {Contains good descriptions of metrics, measurement theory, and validation. See section 2.7 and surroundings. Section 5.3.1 Validations of coupling and complexity metrics}
}

@inproceedings{benlarbi_thresholds_2000,
  title = {Thresholds for object-oriented measures},
  isbn = {0-7695-0807-3},
  url = {http://portal.acm.org/citation.cfm?id=856210},
  abstract = {A practical application of object-oriented measures is to predict which classes are likely to contain a fault. This is contended to be meaningful because object-oriented measures are believed to be indicators of psychological complexity, and classes that are more complex are likely to be faulty. Recently, a cognitive theory has been proposed suggesting that there are threshold effects for many object-oriented measures. This means that object-oriented classes are easy to understand as long as their complexity is below a threshold. Above that threshold, their understandability decreases rapidly, leading to an increased probability of a fault. This occurs, according to the theory, due to an overflow of short-term human memory.
If this theory were confirmed, then it would provide a mechanism that would explain the introduction of faults into object-oriented systems, and would provide some practical guidance on how to design object-oriented programs. In this paper, we empirically test this theory on two C++ telecommunications systems. We test for threshold effects in a subset of the Chidamber and Kemerer {(CK)} suite of measures. The dependent variable was the incidence of faults that lead to field failures. Our results indicate that there are no threshold effects for any of the measures studied. This means that there is no value for the studied {CK} measures where the fault-proneness changes from being steady to rapidly increasing. The results are consistent across the two systems. Therefore, we can provide no support to the posited cognitive theory.},
  booktitle = {Proceedings of the 11th International Symposium on Software Reliability Engineering},
  publisher = {{IEEE} Computer Society},
  author = {Benlarbi, Saida and El Emam, Khaled and Goel, Nishith and Rai, Shesh},
  year = {2000},
  keywords = {empirical, metrics, size},
  pages = {24}
}

@article{chidamber_managerial_1998,
  title = {Managerial use of metrics for object-oriented software: an exploratory analysis},
  volume = {24},
  issn = {0098-5589},
  shorttitle = {Managerial Use of Metrics for {Object-Oriented} Software},
  doi = {10.1109/32.707698},
  abstract = {With the increasing use of object-oriented methods in new software development there is a growing need to both document and improve current practice in object-oriented design and development. In response to this need, a number of researchers have developed various metrics for object-oriented systems as proposed aids to the management of these systems. In this research an analysis of a set of metrics proposed by Chidamber and Kemerer [10] is performed in order to assess their usefulness for practicing managers.
First, an informal introduction to the metrics is provided by way of an extended example of their managerial use. Second, exploratory analyses of empirical data relating the metrics to productivity, rework effort, and design effort on three commercial object-oriented systems are provided. The empirical results suggest that the metrics provide significant explanatory power for variations in these economic variables, over and above that provided by traditional measures, such as size in lines of code, and after controlling for the effects of individual developers.},
  number = {8},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Chidamber, Shyam R. and Darcy, David P. and Kemerer, Chris F.},
  year = {1998},
  keywords = {effort, empirical, metrics, productivity, project management, reuse},
  pages = {629--639}
}

@inproceedings{reichenbach_program_2009,
  address = {Italy},
  title = {Program Metamorphosis},
  isbn = {978-3-642-03012-3},
  url = {http://portal.acm.org/citation.cfm?id=1615184.1615209},
  abstract = {Modern development environments support refactoring by providing atomically behaviour-preserving transformations. While useful, these transformations are limited in three ways: (i) atomicity forces transformations to be complex and opaque, (ii) the behaviour preservation requirement disallows deliberate behaviour evolution, and (iii) atomicity limits code reuse opportunities for refactoring implementers.},
  booktitle = {Proceedings of the 23rd European Conference on {Object-Oriented} Programming ({ECOOP} 2009)},
  publisher = {{Springer-Verlag}},
  author = {Reichenbach, Christoph and Coughlin, Devin and Diwan, Amer},
  year = {2009},
  keywords = {program evolution, refactoring},
  pages = {394--418}
}

@inproceedings{calliss_knowledge-based_1988,
  title = {A {Knowledge-Based} System for Software Maintenance},
  booktitle = {Proceedings of the Conference on Software Maintenance},
  author = {Calliss, F. W. and Khalil, M. and Munro, M.
and Ward, M.},
  year = {1988},
  keywords = {kbs, maintenance, refactoring, transformation},
  pages = {319--324},
  annote = {Expert system for C code analysis and transformation requires knowledge elicitation from expert maintainers and domain experts. Semantics-based using program "plans" "A catalogue of proven transformations..." Relevance: 4}
}

@book{gosling_javatm_2005,
  edition = {Third},
  title = {The {Java(TM)} Language Specification},
  isbn = {0321246780},
  publisher = {Addison Wesley},
  author = {Gosling, James and Joy, Bill and Steele, Guy and Bracha, Gilad},
  month = jun,
  year = {2005}
}

@misc{_heritrix_2011,
  title = {Heritrix - Home Page},
  url = {http://crawler.archive.org/},
  month = jun,
  year = {2011},
  note = {Accessed 2011-10-03},
  howpublished = {http://crawler.archive.org/}
}

@article{maekelae_client-based_2009,
  title = {Client-based cohesion metrics for Java programs},
  volume = {74},
  url = {http://portal.acm.org/citation.cfm?id=1518652},
  abstract = {One purpose of software metrics is to measure the quality of programs. The results can be for example used to predict maintenance costs or improve code quality. An emerging view is that if software metrics are going to be used to improve quality, they must help in finding code that should be refactored. Often refactoring or applying a design pattern is related to the role of the class to be refactored. In client-based metrics, a project gives the class a context. These metrics measure how a class is used by other classes in the context. We present a new client-based metric {LCIC} {(Lack} of Coherence in Clients), which analyses if the class being measured has a coherent set of roles in the program. Interfaces represent the roles of classes. If a class does not have a coherent set of roles, it should be refactored, or a new interface should be defined for the class. We have implemented a tool for measuring the metric {LCIC} for Java projects in the Eclipse environment.
We calculated {LCIC} values for classes of several open source projects. We compare these results with results of other related metrics, and inspect the measured classes to find out what kind of refactorings are needed. We also analyse the relation of different design patterns and refactorings to our metric. Our experiments reveal the usefulness of client-based metrics to improve the quality of code.},
  number = {5--6},
  journal = {Science of Computer Programming},
  author = {M{\"a}kel{\"a}, Sami and Lepp{\"a}nen, Ville},
  year = {2009},
  keywords = {cohesion, design patterns, Eclipse, metrics, refactoring},
  pages = {355--378}
}

@incollection{tempero_how_2008,
  title = {How Do Java Programs Use Inheritance? An Empirical Study of Inheritance in Java Software},
  shorttitle = {How Do Java Programs Use Inheritance?},
  url = {http://dx.doi.org/10.1007/978-3-540-70592-5_28},
  doi = {10.1007/978-3-540-70592-5_28},
  abstract = {Inheritance is a crucial part of object-oriented programming, but its use in practice, and the resulting large-scale inheritance structures in programs, remain poorly understood. Previous studies of inheritance have been relatively small and have generally not considered issues such as Java{\textquoteright}s distinction between classes and interfaces, nor have they considered the use of external libraries. In this paper we present the first substantial empirical study of the large-scale use of inheritance in a contemporary {OO} programming language. We present a suite of structured metrics for quantifying inheritance in Java programs. We present the results of performing a corpus analysis using those metrics to over 90 applications consisting of over 100,000 separate classes and interfaces.
Our analysis finds higher use of inheritance than anticipated, variation in the use of inheritance between interfaces and classes, and differences between inheritance within application types compared with inheritance from external libraries.},
  booktitle = {{ECOOP} 2008 -- {Object-Oriented} Programming},
  publisher = {Springer},
  author = {Tempero, Ewan and Noble, James and Melton, Hayden},
  year = {2008},
  keywords = {empirical, inheritance},
  pages = {667--691}
}

@inproceedings{koschke_survey_2006,
  title = {Survey of research on software clones},
  url = {http://drops.dagstuhl.de/opus/volltexte/2007/962},
  booktitle = {Duplication, Redundancy, and Similarity in Software},
  author = {Koschke, R.},
  year = {2006},
  keywords = {clones, survey}
}

@book{rumbaugh_object-oriented_1990,
  edition = {First},
  title = {{Object-Oriented} Modeling and Design},
  isbn = {0136298419},
  publisher = {{Prentice-Hall}},
  author = {Rumbaugh, James R. and Blaha, Michael R. and Lorensen, William and Eddy, Frederick and Premerlani, William},
  month = oct,
  year = {1990},
  keywords = {{OOD}}
}

@incollection{serban_object-oriented_2008,
  address = {Berlin, Heidelberg},
  title = {Object-oriented software systems restructuring through clustering},
  abstract = {It is well-known that maintenance and evolution represent important stages in the lifecycle of any software system (about 66\% from the total cost of the software systems development). That is why in this paper we are focusing on the problem of automating an essential activity that appears in the maintenance and evolution of software systems: the problem of identifying refactorings that would improve the structure of the system. Refactoring is the process of improving the design of software systems, by improving their internal structure, without altering the external behavior of the code.
The aim of this paper is to introduce a new clustering algorithm, {CASYR} {(Clustering} Algorithm for Software Systems Restructuring), that can be used for improving software systems design, by identifying the appropriate refactorings. The proposed approach can be useful for assisting software engineers in their daily work of refactoring software systems. We evaluate our approach on a real software system and we also provide a comparison with previous approaches.},
  booktitle = {Artificial Intelligence and Soft Computing -- {ICAISC} 2008},
  publisher = {{Springer-Verlag}},
  author = {Serban, Gabriela and Czibula, Istv{\'a}n-Gergely},
  year = {2008},
  keywords = {clustering, maintenance, move method, refactoring, software clustering},
  pages = {693--704},
  annote = {Describes agglomerative clustering for class formation. They use average link for group merging, stating that it gives better results. Relevance: 5}
}

@inproceedings{berzal_mining_2007,
  address = {Zaragoza, Spain},
  title = {Mining the True Structure of Software},
  isbn = {978-84-9732-602-5},
  abstract = {When mining complex data, choosing the right representation for the underlying data is key for the practical application of data mining techniques. In the case of software systems, many program representations have already been proposed in the literature to be used by compilers and other software development tools. In this paper, we propose the use of dependence higraphs, a novel representation technique that, unlike the graph-based representation techniques commonly used by software tools, provides a hierarchical model that makes software systems suitable for the application of efficient tree mining algorithms.
Moreover, our representation model is explicitly designed for making program element matching easier under a wide variety of circumstances, a task at the heart of many software mining problems.},
  booktitle = {V Taller de Miner{\'\i}a de Datos y Aprendizaje ({TAMIDA} '07)},
  author = {Berzal, Fernando and Cubero, Juan-Carlos and Jimenez, Aida},
  month = sep,
  year = {2007},
  keywords = {clustering, data mining, software clustering}
}

@inproceedings{abreu_object-oriented_1994,
  title = {Object-oriented software engineering: Measuring and controlling the development process},
  abstract = {Although the benefits of {Object-Orientation} are manifold and it is, for certain, one of the mainstays for software production in the future, it will only achieve widespread practical acceptance when the management aspects of the software development process using this technology are carefully addressed. Here, software metrics play an important role allowing, among other things, better planning, the assessment of improvements, the reduction of unpredictability, early identification of potential problems and productivity evaluation. This paper proposes a set of metrics suitable for evaluating the use of the main abstractions of the {Object-Oriented} paradigm such as inheritance, encapsulation, information hiding or polymorphism and the consequent emphasis on reuse that, together, are believed to be responsible for the increase in software quality and development productivity. Those metrics are aimed at helping to establish comparisons throughout the practitioners{\textquoteright} community and setting design recommendations that may eventually become organization standards. Some desirable properties for such a metrics set are also presented. Future lines of research are envisaged.},
  booktitle = {Proceedings of the 4th International Conference on Software Quality},
  author = {Abreu, F. B.
and Carapu{\c{c}}a, R.},
  year = {1994},
  keywords = {metrics}
}

@book{henney_97_2010,
  title = {97 Things Every Programmer Should Know: Collective Wisdom from the Experts},
  isbn = {9780596809485},
  shorttitle = {97 Things Every Programmer Should Know},
  publisher = {{O'Reilly} Media, Inc.},
  editor = {Henney, Kevlin},
  month = feb,
  year = {2010}
}

@incollection{beyer_impact_2001,
  title = {Impact of Inheritance on Metrics for Size, Coupling, and Cohesion in {Object-Oriented} Systems},
  url = {http://dx.doi.org/10.1007/3-540-44704-0_1},
  doi = {10.1007/3-540-44704-0_1},
  abstract = {In today{\textquoteright}s engineering of object oriented systems many different metrics are used to get feedback about design quality and to automatically identify design weaknesses. While the concept of inheritance is covered by special inheritance metrics its impact on other classical metrics (like size, coupling or cohesion metrics) is not considered; this can yield misleading measurement values and false interpretations. In this paper we present an approach to work the concept of inheritance into classical metrics (and with it the related concepts of overriding, overloading and polymorphism). This is done by some language dependent flattening functions that modify the data on which the measurement will be done.
These functions are implemented within our metrics tool Crocodile and are applied for a case study: the comparison of the measurement values of the original data with the measurement values of the flattened data yields interesting results and improves the power of classical measurements for interpretation.},
  booktitle = {New Approaches in Software Measurement},
  publisher = {Springer},
  author = {Beyer, Dirk and Lewerentz, Claus and Simon, Frank},
  year = {2001},
  keywords = {cohesion, coupling, inheritance, metrics, size},
  pages = {1--17}
}

@inproceedings{mitchell_comparing_2001,
  address = {Los Alamitos, {CA}, {USA}},
  title = {Comparing the Decompositions Produced by Software Clustering Algorithms using Similarity Measurements},
  doi = {10.1109/ICSM.2001.972795},
  abstract = {Decomposing source code components and relations into subsystem clusters is an active area of research. Numerous clustering approaches have been proposed in the reverse engineering literature, each one using a different algorithm to identify subsystems. Since different clustering techniques may not produce identical results when applied to the same system, mechanisms that can measure the extent of these differences are needed. Some work to measure the similarity between decompositions has been done, but this work considers the assignment of source code components to clusters as the only criterion for similarity. We argue that better similarity measurements can be designed if the relations between the components are considered. In this paper we propose two similarity measurements that overcome certain problems in existing measurements. We also provide some suggestions on how to identify and deal with source code components that tend to contribute to poor similarity results.
We conclude by presenting experimental results, and by highlighting some of the benefits of our similarity measurements.},
  booktitle = {{IEEE} International Conference on Software Maintenance},
  publisher = {{IEEE} Computer Society},
  author = {Mitchell, Brian S. and Mancoridis, Spiros},
  year = {2001},
  keywords = {cluster evaluation, clustering, evaluation, similarity, software clustering, software maintenance},
  pages = {744}
}

@incollection{grcar_using_2008,
  title = {Using Text Mining and Link Analysis for Software Mining},
  url = {http://dx.doi.org/10.1007/978-3-540-68416-9_1},
  doi = {10.1007/978-3-540-68416-9_1},
  abstract = {Many data mining techniques are these days in use for ontology learning {\textendash} text mining, Web mining, graph mining, link analysis, relational data mining, and so on. In the current state-of-the-art bundle there is a lack of {\textquotedblleft}software mining{\textquotedblright} techniques. This term denotes the process of extracting knowledge out of source code. In this paper we approach the software mining task with a combination of text mining and link analysis techniques. We discuss how each instance (i.e. a programming construct such as a class or a method) can be converted into a feature vector that combines the information about how the instance is interlinked with other instances, and the information about its (textual) content. The so-obtained feature vectors serve as the basis for the construction of the domain ontology with {OntoGen}, an existing system for semi-automatic data-driven ontology construction.},
  booktitle = {Mining Complex Data},
  publisher = {Springer},
  author = {Grcar, Miha and Grobelnik, Marko and Mladenic, Dunja},
  year = {2008},
  keywords = {clustering, data mining, graphs, similarity, software clustering},
  pages = {1--12},
  annote = {Somewhat difficult to read in places. Talks about feature vectors and combining different graphs of inter-related classes.
Relevance: 5}
}

@misc{scientific_toolworks_inc._understand_2009,
  title = {Understand 2.0 User Guide and Reference Manual},
  url = {http://getunderstand.com/documents/manuals/pdf/understand.pdf},
  publisher = {Scientific Toolworks, Inc.},
  author = {{Scientific Toolworks, Inc.}},
  month = jan,
  year = {2009},
  keywords = {metrics}
}

@book{brown_antipatterns:_1998,
  edition = {First},
  title = {{AntiPatterns}: Refactoring Software, Architectures, and Projects in Crisis},
  isbn = {0471197130},
  shorttitle = {{AntiPatterns}},
  publisher = {Wiley},
  author = {Brown, William J. and Malveau, Raphael C. and {McCormick}, Hays W. {"Skip"} and Mowbray, Thomas J.},
  month = apr,
  year = {1998}
}

@inproceedings{balazinska_partial_1999,
  title = {Partial Redesign of Java Software Systems Based on Clone Analysis},
  isbn = {0-7695-0303-9},
  url = {http://portal.acm.org/citation.cfm?id=837061},
  abstract = {Code duplication, plausibly caused by copying source code and slightly modifying it, is often observed in large systems. Clone detection and documentation have been investigated by several researchers in the past years. Recently, research focus has shifted towards the investigation of software and process restructuring actions based on clone detection. This paper presents a new redesign approach developed for Java software systems. The approach factorizes the common parts of cloned methods and parameterizes their differences using the strategy design pattern. The new entities created by such transformations are also decoupled from the original contexts of their use thus facilitating reuse and increasing maintainability.
The applicability and automation of the technique presented in the paper have been verified by partially redesigning {JDK} 1.1.5.}, booktitle = {Proceedings of the Sixth Working Conference on Reverse Engineering}, publisher = {{IEEE} Computer Society}, author = {Balazinska, Magdalena and Merlo, Ettore and Dagenais, Michel and Lague, Bruno and Kontogiannis, Kostas}, year = {1999}, keywords = {clone detection, design patterns}, pages = {326} }, @article{andritsos_information-theoretic_2005, title = {Information-theoretic software clustering}, volume = {31}, issn = {0098-5589}, doi = {10.1109/TSE.2005.25}, abstract = {The majority of the algorithms in the software clustering literature utilize structural information to decompose large software systems. Approaches using other attributes, such as file names or ownership information, have also demonstrated merit. At the same time, existing algorithms commonly deem all attributes of the software artifacts being clustered as equally important, a rather simplistic assumption. Moreover, no method that can assess the usefulness of a particular attribute for clustering purposes has been presented in the literature. In this paper, we present an approach that applies information theoretic techniques in the context of software clustering. Our approach allows for weighting schemes that reflect the importance of various attributes to be applied. We introduce {LIMBO}, a scalable hierarchical clustering algorithm based on the minimization of information loss when clustering a software system. We also present a method that can assess the usefulness of any nonstructural attribute in a software clustering context. We applied {LIMBO} to three large software systems in a number of experiments. The results indicate that this approach produces clusterings that come close to decompositions prepared by system experts. Experimental results were also used to validate our usefulness assessment method. 
Finally, we experimented with well-established weighting schemes from information retrieval, Web search, and data clustering. We report results as to which weighting schemes show merit in the decomposition of software systems.}, number = {2}, journal = {{IEEE} Transactions on Software Engineering}, author = {Andritsos, P. and Tzerpos, V.}, year = {2005}, keywords = {clustering, information retrieval, maintenance, metrics, reverse engineering, software clustering}, pages = {150--165} }, @inproceedings{baxter_clone_1998, title = {Clone Detection Using Abstract Syntax Trees}, isbn = {0-8186-8779-7}, url = {http://portal.acm.org/citation.cfm?id=850947.853341&coll=GUIDE&dl=GUIDE&CFID=12519657&CFTOKEN=83976739}, abstract = {Existing research suggests that a considerable fraction (5-10\%) of the source code of large-scale computer programs is duplicate code ("clones"). Detection and removal of such clones promises decreased software maintenance costs of possibly the same magnitude. Previous work was limited to detection of either near-misses differing only in single lexems, or near misses only between complete functions. This paper presents simple and practical methods for detecting exact and near miss clones over arbitrary program fragments in program source code by using abstract syntax trees. Previous work also did not suggest practical means for removing detected clones. Since our methods operate in terms of the program structure, clones could be removed by mechanical methods producing in-lined procedures or standard preprocessor macros. A tool using these techniques is applied to a C production software system of some {400K} source lines, and the results confirm detected levels of duplication found by previous work. The tool produces macro bodies needed for clone removal, and macro invocations to replace the clones. The tool uses a variation of the well-known compiler method for detecting common sub-expressions. 
This method determines exact tree matches; a number of adjustments are needed to detect equivalent statement sequences, commutative operands, and nearly exact matches. We additionally suggest that clone detection could also be useful in producing more structured code, and in reverse engineering to discover domain concepts and their implementations.}, booktitle = {Proceedings of the International Conference on Software Maintenance}, publisher = {{IEEE} Computer Society}, author = {Baxter, Ira D. and Yahin, Andrew and Moura, Leonardo and {Sant'Anna}, Marcelo and Bier, Lorraine}, year = {1998}, keywords = {clones, maintenance, metrics}, pages = {368} }, @article{mittal_knowledge_1985, title = {Knowledge Acquisition from Multiple Experts}, volume = {6}, url = {http://www.aaai.org/ojs/index.php/aimagazine/article/viewArticle/477}, abstract = {Expert system projects are often based on collaboration with single domain expert. This leads to difficulties in judging the suitability of the chosen task and in acquiring the detailed knowledge required to carry out the task. This anecdotal article considers some of the advantages of using a diverse collection of domain experts.}, number = {2}, journal = {{AI} Magazine}, author = {Mittal, Sanjay and Dym, Clive}, month = jun, year = {1985}, keywords = {knowledge acquisition}, pages = {32--36} }, @inproceedings{liu_modeling_2009, address = {Los Alamitos, {CA}, {USA}}, title = {Modeling class cohesion as mixtures of latent topics}, isbn = {978-1-4244-4897-5}, doi = {10.1109/ICSM.2009.5306318}, abstract = {The paper proposes a new measure for the cohesion of classes in {Object-Oriented} software systems. It is based on the analysis of latent topics embedded in comments and identifiers in source code. The measure, named as Maximal Weighted Entropy, utilizes the Latent Dirichlet Allocation technique and information entropy measures to quantitatively evaluate the cohesion of classes in software. 
This paper presents the principles and the technology that stand behind the proposed measure. Two case studies on a large open source software system are presented. They compare the new measure with an extensive set of existing metrics and use them to construct models that predict software faults. The case studies indicate that the novel measure captures different aspects of class cohesion compared to the existing cohesion measures and improves fault prediction for most metrics, which are combined with Maximal Weighted Entropy.}, booktitle = {{IEEE} International Conference on Software Maintenance}, publisher = {{IEEE} Computer Society}, author = {Liu, Yixun and Poshyvanyk, Denys and Ferenc, Rudolf and Gyimothy, Tibor and Chrisochoides, Nikos}, year = {2009}, keywords = {cohesion, metrics, semantics}, pages = {233--242} }, @techreport{roy_survey_2007, address = {Ontario, Canada}, type = {Technical Report}, title = {A Survey on Software Clone Detection Research}, url = {http://serv2.ist.psu.edu:8080/viewdoc/summary?doi=10.1.1.62.7869}, abstract = {Code duplication or copying a code fragment and then reuse by pasting with or without any modifications is a well known code smell in software maintenance. Several studies show that about 5\% to 20\% of a software systems can contain duplicated code, which is basically the results of copying existing code fragments and using then by pasting with or without minor modifications. One of the major shortcomings of such duplicated fragments is that if a bug is detected in a code fragment, all the other fragments similar to it should be investigated to check the possible existence of the same bug in the similar fragments. Refactoring of the duplicated code is another prime issue in software maintenance although several studies claim that refactoring of certain clones are not desirable and there is a risk of removing them. However, it is also widely agreed that clones should at least be detected. 
In this paper, we survey the state of the art in clone detection research. First, we describe the clone terms commonly used in the literature along with their corresponding mappings to the commonly used clone types. Second, we provide a review of the existing clone taxonomies, detection approaches and experimental evaluations of clone detection tools. Applications of clone detection research to other domains of software engineering and in the same time how other domain can assist clone detection research have also been pointed out. Finally, this paper concludes by pointing out several open problems related to clone detection research.}, number = {2007-541}, institution = {School of Computing, Queen{\textquoteright}s University at Kingston}, author = {Roy, Chanchal Kumar and Cordy, James R.}, year = {2007}, keywords = {clones, survey} }, @inproceedings{swanson_dimensions_1976, address = {San Francisco, California, United States}, title = {The dimensions of maintenance}, url = {http://portal.acm.org/citation.cfm?id=800253.807723}, abstract = {The area of software maintenance has been described by one author as an {\textquotedblleft}iceberg.{\textquotedblright} ({EDP} Analyzer, 1972) Much goes on here that does not currently meet the eye. In part, this is the consequence of measurement difficulties. Practitioners and researchers can benefit from an understanding of the {\textquotedblleft}dimensionality{\textquotedblright} of the maintenance problem. Some measures are suggested for coming to grips with this dimensionality, and problems of utilization associated with these measures are explored.}, booktitle = {Proceedings of the 2nd international conference on Software engineering}, publisher = {{IEEE} Computer Society Press}, author = {Swanson, E. 
Burton}, year = {1976}, keywords = {maintenance, performance measurement}, pages = {492--497} }, @article{flake_self-organization_2002, title = {{Self-Organization} and Identification of Web Communities}, volume = {35}, url = {http://portal.acm.org/citation.cfm?id=621934}, abstract = {Millions of individuals operating independently author the Web's information. Despite its decentralized nature, the authors' work shows that the Web self-organizes and its link structure allows efficient identification of communities. This is significant because no central authority or process governs hyperlink formation and structure. A Web community is a collection of Web pages in which each member page has more hyperlinks within the community than outside it. Compared to previous methods of finding related Web pages, the authors describe an approach that retains the transparency of methods such as cocitation and bibliographic coupling in explaining why pages belong to a community, yet can identify Web communities of arbitrary dimensions. Applications of their method include creating improved search engines, content filtering, and objective analysis of Web content and the relationships between Web communities.}, number = {3}, journal = {Computer}, author = {Flake, Gary William and Lawrence, Steve and Giles, C. 
Lee and Coetzee, Frans M.}, year = {2002}, keywords = {clustering, graph algorithms, graphs}, pages = {66--71} }, @inproceedings{tairas_visualizing_2007, address = {New York, {NY}, {USA}}, title = {Visualizing clone detection results}, isbn = {978-1-59593-882-4}, location = {Atlanta, Georgia, {USA}}, doi = {10.1145/1321631.1321738}, booktitle = {{ASE} '07: Proceedings of the twenty-second {IEEE/ACM} international conference on Automated software engineering}, publisher = {{ACM}}, author = {Tairas, Robert and Gray, Jeff and Baxter, Ira D.}, year = {2007}, keywords = {clones, visualization}, pages = {549--550}, annote = {Discusses clone visualization where classes are represented as rectangles and various clones are represented as different colored stripes of various thicknesses. Unfortunately, the inheritance hierarchy does not seem to be represented. Relevance: 3} }, @misc{_freecol_2011, title = {{FreeCol} - Home}, url = {http://www.freecol.org/}, year = {2011}, note = {Accessed 2011-10-03}, howpublished = {http://www.freecol.org/} }, @article{proulx_network_2005, title = {Network thinking in ecology and evolution}, volume = {20}, issn = {0169-5347}, url = {http://www.ncbi.nlm.nih.gov/pubmed/16701391}, doi = {10.1016/j.tree.2005.04.004}, abstract = {Although pairwise interactions have always had a key role in ecology and evolutionary biology, the recent increase in the amount and availability of biological data has placed a new focus on the complex networks embedded in biological systems. The increased availability of computational tools to store and retrieve biological data has facilitated wide access to these data, not just by biologists but also by specialists from the social sciences, computer science, physics and mathematics. This fusion of interests has led to a burst of research on the properties and consequences of network structure in biological systems. 
Although traditional measures of network structure and function have started us off on the right foot, an important next step is to create biologically realistic models of network formation, evolution, and function. Here, we review recent applications of network thinking to the evolution of networks at the gene and protein level and to the dynamics and stability of communities. These studies have provided new insights into the organization and function of biological systems by applying existing techniques of network analysis. The current challenge is to recognize the commonalities in evolutionary and ecological applications of network thinking to create a predictive science of biological networks.}, number = {6}, journal = {Trends in Ecology \& Evolution (Personal Edition)}, author = {Proulx, Stephen R and Promislow, Daniel E L and Phillips, Patrick C}, month = jun, year = {2005}, note = {{PMID:} 16701391}, pages = {345--353} }, @article{moody_structural_2003, title = {Structural cohesion and embeddedness: A hierarchical concept of social groups}, volume = {68}, shorttitle = {Structural cohesion and embeddedness}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.5695}, journal = {American Sociological Review}, author = {Moody, James and White, Douglas R}, year = {2003}, keywords = {cohesion, coupling, graphs, metrics, {SNA}}, pages = {103--127}, annote = {This is a paper about coupling and cohesion from a social scientist's perspective. It is more interesting than most in that it considers hierarchical clusters, i.e. clusters of clusters. Relevance: 4} }, @inproceedings{wallnau_construction_1988, address = {George Mason University}, title = {Construction of knowledge-based components and applications in {Ada}}, booktitle = {Proceedings of {AIDA-88}, Fourth Annual Conference on Artificial Intelligence and Ada}, author = {Wallnau, K. and Solderitsch, J. and Simos, M. and {McDowell}, R. and Cassell, K. 
and Campbell, D.}, month = nov, year = {1988}, pages = {3} }, @inproceedings{yan_gspan:_2002, address = {Los Alamitos, {CA}, {USA}}, title = {{gSpan:} {Graph-Based} Substructure Pattern Mining}, isbn = {0-7695-1754-4}, shorttitle = {{gSpan}}, doi = {10.1109/ICDM.2002.1184038}, abstract = {We investigate new approaches for frequent graph-based pattern mining in graph datasets and propose a novel algorithm called {gSpan} (graph-based Substructure pattern mining), which discovers frequent substructures without candidate generation. {gSpan} builds a new lexicographic order among graphs, and maps each graph to a unique minimum {DFS} code as its canonical label. Based on this lexico-graphic order, {gSpan} adopts the depth-first search strategy to mine frequent connected subgraphs efficiently. Our performance study shows that {gSpan} substantially outperforms previous algorithms, sometimes by an order of magnitude.}, booktitle = {{IEEE} International Conference on Data Mining}, publisher = {{IEEE} Computer Society}, author = {Yan, Xifeng and Han, Jiawei}, year = {2002}, pages = {721} }, @inproceedings{chiricota_software_2003, title = {Software components capture using graph clustering}, abstract = {We describe a simple, fast computing and easy to implement method for finding relatively good clusterings of software systems. Our method relies on the ability to compute the strength of an edge in a graph by applying a straightforward metric defined in terms of the neighborhoods of its end vertices. The metric is used to identify the weak edges of the graph, which are momentarily deleted to break it into several components. We study the quality metric {MQ} introduced in [1] and exhibit mathematical properties that make it a good measure for clustering quality. Letting the threshold weakness of edges vary defines a path, i.e. a sequence of clusterings in the solution space (of all possible clustering of the graph). 
This path is described in terms of a curve linking {MQ} to the weakness of the edges in the graph}, booktitle = {11th {IEEE} International Workshop on Program Comprehension}, author = {Chiricota, Y. and Jourdan, F. and Melan{\c{c}}on, G.}, year = {2003}, keywords = {clustering, graphs, software clustering, software modules, subsystem identification} }, @inproceedings{riaz_systematic_2009, title = {A systematic review of software maintainability prediction and metrics}, isbn = {978-1-4244-4842-5}, url = {http://portal.acm.org/citation.cfm?id=1671283&dl=GUIDE&coll=GUIDE&CFID=95908762&CFTOKEN=43459678}, abstract = {This paper presents the results of a systematic review conducted to collect evidence on software maintainability prediction and metrics. The study was targeted at the software quality attribute of maintainability as opposed to the process of software maintenance. The evidence was gathered from the selected studies against a set of meaningful and focused questions. 710 studies were initially retrieved; however of these only 15 studies were selected; their quality was assessed; data extraction was performed; and data was synthesized against the research questions. Our results suggest that there is little evidence on the effectiveness of software maintainability prediction techniques and models.}, booktitle = {Proceedings of the 2009 3rd International Symposium on Empirical Software Engineering and Measurement}, publisher = {{IEEE} Computer Society}, author = {Riaz, Mehwish and Mendes, Emilia and Tempero, Ewan}, year = {2009}, keywords = {maintainability, metrics, survey}, pages = {367--377}, annote = {Close to useless - "710 studies were initially retrieved; however of these only 15 studies were selected".} }, @book{binkley_beyond_2006, series = {Dagstuhl Seminar Proceedings}, title = {Beyond Program Slicing, 06.11. 
- 11.11.2005}, volume = {05451}, publisher = {Internationales Begegnungs- und Forschungszentrum fuer Informatik {(IBFI)}, Schloss Dagstuhl, Germany}, editor = {Binkley, David and Harman, Mark and Krinke, Jens}, year = {2006} }, @inproceedings{etzkorn_semantic_2006, title = {Semantic Metrics, Conceptual Metrics, and Ontology Metrics: An Analysis of Software Quality Using {IR-based} Systems, Potential Applications and Collaborations}, abstract = {Similarities and differences between {\textquotedblleft}semantic metrics{\textquotedblright} (metrics defined on a knowledge-based {IR} system) and {\textquotedblleft}conceptual metrics{\textquotedblright} (metrics defined on a Latent Semantic Indexing-based {IR} system) are discussed. Potential collaboration areas between research groups are identified. Potential application and collaboration areas of a new research area called {\textquotedblleft}ontology metrics,{\textquotedblright} metrics calculated on the ontologies that form part of an ontology-based software system, are also discussed. Currently ontology metrics are calculated using techniques similar to semantic metrics, but other semantically-based expansions, including some similar to conceptual metrics are possible.}, booktitle = {{IEEE} International Conference on Software Maintenance}, author = {Etzkorn, L. H.}, year = {2006}, keywords = {cohesion, metrics, ontologies, semantics}, annote = {Talks about metrics that are defined in the context of comment and identifier analysis within a program understanding system based on knowledge-based natural language understanding. 
Relevance: 4} }, @book{wasserman_social_1994, address = {Cambridge}, series = {Structural analysis in the social sciences}, title = {Social Network Analysis: Methods and Applications}, isbn = {0521387078}, lccn = {{HM131} {.W356} 1994}, shorttitle = {Social Network Analysis}, number = {8}, publisher = {Cambridge University Press}, author = {Wasserman, Stanley and Faust, Katherine}, year = {1994}, keywords = {{SNA}} }, @misc{_iso9126_????, title = {{ISO9126} - Software Quality Characteristics}, url = {http://www.sqa.net/iso9126.html}, abstract = {An overview of the {ISO} 9126-1 software quality model definition, with an explanation of the major characteristics.}, keywords = {{ISO} 9126 model, maintainability, testing}, howpublished = {http://www.sqa.net/iso9126.html} }, @book{iso_international_2006, edition = {2}, title = {International Standard - {ISO/IEC} 14764 {IEEE} Std 14764-2006 - Software Engineering - Software Life Cycle Processes - Maintenance}, isbn = {0-7381-4961-6 {SS95534}}, abstract = {The process for managing and executing software maintenance activities is described.}, publisher = {{IEEE}}, author = {{ISO} and {IEEE}}, month = sep, year = {2006}, keywords = {maintenance}, annote = {Types of maintenance: * Corrective maintenance: Reactive modification of a software product performed after delivery to correct discovered problems. * Adaptive maintenance: Modification of a software product performed after delivery to keep a software product usable in a changed or changing environment. * Perfective maintenance: Modification of a software product after delivery to improve performance or maintainability. * Preventive maintenance: Modification of a software product after delivery to detect and correct latent faults in the software product before they become effective faults. 
Relevance: 3} }, @inproceedings{kegel_systematically_2008, address = {Leipzig, Germany}, title = {Systematically refactoring inheritance to delegation in {Java}}, isbn = {978-1-60558-079-1}, url = {http://portal.acm.org/citation.cfm?id=1368147&dl=GUIDE&coll=GUIDE&CFID=37949062&CFTOKEN=90965216}, doi = {10.1145/1368088.1368147}, abstract = {Because of the strong coupling of classes and the proliferation of unneeded class members induced by inheritance, the suggestion to use composition and delegation instead has become commonplace. The presentation of a corresponding refactoring in the literature may lead one to believe that such a transformation is a straightforward undertaking. However, closer analysis reveals that this refactoring is neither always possible, nor does it necessarily achieve its desired effect. We have therefore identified the necessary preconditions and realizable postconditions of the refactoring, and built a tool that can perform it completely automatically. By applying this tool to all subclasses of several open-source projects, we have collected evidence of the applicability of the refactoring and of its capability to deliver on its promises. The refactoring builds on constraint graphs originally developed for type inference to check the preconditions and to compute the necessary delegation as well as the subtype relationships that must be maintained.}, booktitle = {Proceedings of the 30th International Conference on Software Engineering}, publisher = {{ACM}}, author = {Kegel, Hannes and Steimann, Friedrich}, year = {2008}, keywords = {delegation, inheritance, open recursion, refactoring}, pages = {431--440}, annote = {This is a well-written paper that goes into detail about the work needed to convert an inheritance relationship into a delegation relationship. It even discusses {IntelliJ's} implementation of the refactoring and its problems. 
Relevance: 4} }, @article{miller_wordnet:_1995, title = {{WordNet:} a lexical database for {English}}, volume = {38}, shorttitle = {{WordNet}}, url = {http://portal.acm.org/citation.cfm?id=219748&dl=}, doi = {10.1145/219717.219748}, abstract = {Because meaningful sentences are composed of meaningful words, any system that hopes to process natural languages as people do must have information about words and their meanings. This information is traditionally provided through dictionaries, and machine-readable dictionaries are now widely available. But dictionary entries evolved for the convenience of human readers, not for machines. {WordNet} provides a more effective combination of traditional lexicographic information and modern computing. {WordNet} is an online lexical database designed for use under program control. English nouns, verbs, adjectives, and adverbs are organized into sets of synonyms, each representing a lexicalized concept. Semantic relations link the synonym sets [4].}, number = {11}, journal = {Communications of the {ACM}}, author = {Miller, George A.}, year = {1995}, keywords = {semantic networks, semantics}, pages = {39--41} }, @article{busygin_biclustering_2008, title = {Biclustering in data mining}, volume = {35}, issn = {0305-0548}, url = {http://www.sciencedirect.com/science/article/B6VC5-4N0HJJD-3/2/93cb7d7b3edb8b3cd7a0d402cdf31f2e}, doi = {10.1016/j.cor.2007.01.005}, abstract = {Biclustering consists in simultaneous partitioning of the set of samples and the set of their attributes (features) into subsets (classes). Samples and features classified together are supposed to have a high relevance to each other. In this paper we review the most widely used and successful biclustering techniques and their related applications. 
This survey is written from a theoretical viewpoint emphasizing mathematical concepts that can be met in existing biclustering techniques.}, number = {9}, journal = {Computers \& Operations Research}, author = {Busygin, Stanislav and Prokopyev, Oleg and Pardalos, Panos M.}, month = sep, year = {2008}, keywords = {Biclustering, clustering, data mining, survey}, pages = {2964--2987} }, @inproceedings{marcus_semantic_2004, title = {Semantic Driven Program Analysis}, isbn = {0-7695-2213-0}, url = {http://portal.acm.org/citation.cfm?id=1018431.1021459&coll=GUIDE&dl=GUIDE&CFID=101258307&CFTOKEN=28035505}, abstract = {The paper presents an approach to extract and to analyze the semantic content (i.e., problem and solution domain semantics) of existing software systems to support program understanding and software various maintenance tasks, such as: recovery of traceability links between documentation and source code, identification of abstract data types in legacy code, and identification of high-level concept clones in software. The semantic information is derived from the comments, documentation, and identifier names associated with the source code using information retrieval methods. The paper advocates for the use of latent semantic indexing as the underlying support for the semantic driven analysis. 
The presented results are based on the author's doctoral dissertation [12].}, booktitle = {Proceedings of the 20th {IEEE} International Conference on Software Maintenance}, publisher = {{IEEE} Computer Society}, author = {Marcus, Andrian}, year = {2004}, keywords = {program comprehension, semantics}, pages = {469--473} }, @inproceedings{nasseri_empirical_2010, address = {Montreal, {QC}, Canada}, title = {An Empirical Study of {Fan-In} and {Fan-Out} in {Java} {OSS}}, url = {http://dx.doi.org/10.1109/SERA.2010.15}, doi = {10.1109/SERA.2010.15}, abstract = {Coupling is a well researched topic in the {Object-Oriented} {(OO)} research community and its influence on class cohesion is well understood. In this paper, we present an empirical study exploring the effect of method calling on class cohesion using two coupling metrics, namely fan-in and fan-out. Three Java, open-source systems {(OSS)} were used as a basis of the study. A small number of classes were found to account for the vast majority of fan-in and fan-out. We also found the impact of fan-out on class cohesion to be higher than that of fan-in. Classes containing fan-out tended to have lower cohesion than those containing fan-in.}, booktitle = {2010 Eighth {ACIS} International Conference on Software Engineering Research, Management and Applications}, author = {Nasseri, E. and Counsell, S. and Tempero, E.}, month = may, year = {2010}, keywords = {coupling, empirical, metrics}, pages = {36--41} }, @inproceedings{neate_coderank:_2006, address = {Los Alamitos, {CA}, {USA}}, title = {{CodeRank:} A New Family of Software Metrics}, shorttitle = {{CodeRank}}, doi = {10.1109/ASWEC.2006.21}, abstract = {The concept of pagerank has proved successful in allowing search engines to identify important pages in the World Wide Web. 
In this paper, we describe the application of the pagerank concept to the domain of software in order to derive a new family of metrics, {CodeRank}, which captures aspects of software not readily obtainable from other metrics. We have implemented a tool, {CODERANKER}, to compute values of {CodeRank} metrics using a full semantic model which we have developed. We present some results and discuss the use of {CodeRank} metrics in their interpretation.}, booktitle = {Australian Software Engineering Conference}, publisher = {{IEEE} Computer Society}, author = {Neate, Blair and Irwin, Warwick and Churcher, Neville}, year = {2006}, pages = {369--378} }, @article{smith_obo_2007, title = {The {OBO} Foundry: coordinated evolution of ontologies to support biomedical data integration}, volume = {25}, issn = {1087-0156}, shorttitle = {The {OBO} Foundry}, url = {http://dx.doi.org/10.1038/nbt1346}, doi = {10.1038/nbt1346}, number = {11}, journal = {Nature Biotechnology}, author = {Smith, Barry and Ashburner, Michael and Rosse, Cornelius and Bard, Jonathan and Bug, William and Ceusters, Werner and Goldberg, Louis J and Eilbeck, Karen and Ireland, Amelia and Mungall, Christopher J and Leontis, Neocles and {Rocca-Serra}, Philippe and Ruttenberg, Alan and Sansone, {Susanna-Assunta} and Scheuermann, Richard H and Shah, Nigam and Whetzel, Patricia L and Lewis, Suzanna}, month = nov, year = {2007}, pages = {1251--1255} }, @inproceedings{verbaere_scripting_2006, address = {Portland, Oregon, {USA}}, title = {Scripting refactorings with {JunGL}}, isbn = {1-59593-491-X}, url = {http://portal.acm.org/citation.cfm?id=1176656}, doi = {10.1145/1176617.1176656}, abstract = {We describe {JunGL}, a language to script refactoring transformations. It manipulates a graph representation of the program, including extensible semantic information such as variable binding and dataflow. 
{JunGL} enables the full automation of complex refactorings: finding program elements of interest, checking preconditions and performing the transformation itself.}, booktitle = {Companion to the 21st {ACM} {SIGPLAN} symposium on Object-oriented programming systems, languages, and applications}, publisher = {{ACM}}, author = {Verbaere, Mathieu and Payement, Arnaud and de Moor, Oege}, year = {2006}, keywords = {refactoring, scripting language}, pages = {651--652}, annote = {Talks about a scripting language for transformations that is a hybrid of a functional and a logical query language. It has been used to recognize and perform three different refactorings. Relevance: 5} }, @inproceedings{tsantalis_jdeodorant:_2008, title = {{JDeodorant:} Identification and Removal of {Type-Checking} Bad Smells}, shorttitle = {{JDeodorant}}, url = {http://dx.doi.org/10.1109/CSMR.2008.4493342}, doi = {10.1109/CSMR.2008.4493342}, abstract = {In this demonstration, we present an Eclipse plug-in that automatically identifies type-checking bad smells in Java source code, and resolves them by applying the "replace conditional with polymorphism" or "replace type code with state/strategy" refactorings. To the best of our knowledge there is a lack of tools that identify type-checking bad smells. Moreover, none of the state-of-the-art {IDEs} support the refactorings that resolve such kind of bad smells.}, booktitle = {12th European Conference on Software Maintenance and Reengineering ({CSMR} 2008)}, author = {Tsantalis, N and Chaikalis, T and Chatzigeorgiou, A}, year = {2008}, keywords = {refactoring, smells}, pages = {329--331}, annote = {The authors present how type checking (switches and conditionals over type-like information) can be detected and refactored. 
The implementation described can detect and refactor 3 well-known examples.}
}

@mastersthesis{skowronski_jrepackager_2005,
  author = {Skowronski, Jason Daniel},
  title = {{JRepackager}: A Tool for Extending Package Level Refactoring},
  school = {University of Illinois at {Urbana-Champaign}},
  year = {2005},
  url = {http://worldcat.org/oclc/73801302},
  keywords = {Eclipse, metrics, refactoring},
  annote = {Worked with Danny Dig. Used the Eclipse metrics plug-in.}
}

@incollection{salvetti_local_2005,
  title = {Local Flow Betweenness Centrality for Clustering Community Graphs},
  url = {http://dx.doi.org/10.1007/11600930_53},
  doi = {10.1007/11600930_53},
  abstract = {The problem of information flow is studied to identify de facto communities of practice from tacit knowledge sources that reflect the underlying community structure, using a collection of instant message logs. We characterize and model the community detection problem using a combination of graph theory and ideas of centrality from social network analysis. We propose, validate, and develop a novel algorithm to detect communities based on computation of the Local Flow Betweenness Centrality. Using {LFBC}, we model the weights on the edges in the graph so we can extract communities. We also present how to compute efficiently {LFBC} on relevant edges without having to recalculate the measure for each edge in the graph during the process.
We validate our algorithms on a corpus of instant messages that we call {MLog}. Our results demonstrate that {MLogs} are a useful source for community detection that can augment the study of collaborative behavior.},
  booktitle = {Internet and Network Economics},
  publisher = {Springer},
  author = {Salvetti, Franco and Srinivasan, Savitha},
  year = {2005},
  keywords = {graphs, {SNA}},
  pages = {531--544}
}

@article{emam_confounding_2001,
  title = {The confounding effect of class size on the validity of object-oriented metrics},
  volume = {27},
  url = {http://portal.acm.org/citation.cfm?id=381782.381786},
  abstract = {Much effort has been devoted to the development and empirical validation of object-oriented metrics. The empirical validations performed thus far would suggest that a core set of validated metrics is close to being identified. However, none of these studies allow for the potentially confounding effect of class size. In this paper, we demonstrate a strong size confounding effect and question the results of previous object-oriented metrics validation studies. We first investigated whether there is a confounding effect of class size in validation studies of object-oriented metrics and show that, based on previous work, there is reason to believe that such an effect exists. We then describe a detailed empirical methodology for identifying those effects. Finally, we perform a study on a large C++ telecommunications framework to examine if size is really a confounder. This study considered the Chidamber and Kemerer metrics and a subset of the Lorenz and Kidd metrics. The dependent variable was the incidence of a fault attributable to a field failure (fault-proneness of a class). Our findings indicate that, before controlling for size, the results are very similar to previous studies: The metrics that are expected to be validated are indeed associated with fault-proneness. After controlling for size, none of the metrics we studied were associated with fault-proneness anymore.
This demonstrates a strong size confounding effect and casts doubt on the results of previous object-oriented metrics validation studies. It is recommended that previous validation studies be reexamined to determine whether their conclusions would still hold after controlling for size and that future validation studies should always control for size.},
  number = {7},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {El Emam, Khaled and Benlarbi, Sa{\"\i}da and Goel, Nishith and Rai, Shesh N.},
  year = {2001},
  keywords = {coupling, empirical, metrics, metrics validation, size},
  pages = {630--650}
}

@inproceedings{basit_detecting_2005,
  address = {Lisbon, Portugal},
  title = {Detecting higher-level similarity patterns in programs},
  isbn = {1-59593-014-0},
  url = {http://portal.acm.org/citation.cfm?id=1081706.1081733},
  doi = {10.1145/1081706.1081733},
  abstract = {Cloning in software systems is known to create problems during software maintenance. Several techniques have been proposed to detect the same or similar code fragments in software, so-called simple clones. While the knowledge of simple clones is useful, detecting design-level similarities in software could ease maintenance even further, and also help us identify reuse opportunities. We observed that recurring patterns of simple clones - so-called structural clones - often indicate the presence of interesting design-level similarities. An example would be patterns of collaborating classes or components. Finding structural clones that signify potentially useful design information requires efficient techniques to analyze the bulk of simple clone data and making non-trivial inferences based on the abstracted information. In this paper, we describe a practical solution to the problem of detecting some basic, but useful, types of design-level similarities such as groups of highly similar classes or files.
First, we detect simple clones by applying conventional token-based techniques. Then we find the patterns of co-occurring clones in different files using the Frequent Itemset Mining ({FIM}) technique. Finally, we perform file clustering to detect those clusters of highly similar files that are likely to contribute to a design-level similarity pattern. The novelty of our approach is application of data mining techniques to detect design level similarities. Experiments confirmed that our method finds many useful structural clones and scales up to big programs. The paper describes our method for structural clone detection, a prototype tool called Clone Miner that implements the method and experimental results.},
  booktitle = {Proceedings of the 10th European software engineering conference held jointly with 13th {ACM} {SIGSOFT} international symposium on Foundations of software engineering},
  publisher = {{ACM}},
  author = {Basit, Hamid Abdul and Jarzabek, Stan},
  year = {2005},
  keywords = {clones, design patterns, similarity},
  pages = {156--165},
  annote = {Relevance: 5}
}

@article{jehad_improving_2011,
  title = {Improving the applicability of object-oriented class cohesion metrics},
  volume = {53},
  issn = {0950-5849},
  url = {http://www.sciencedirect.com/science/article/pii/S0950584911000632},
  doi = {10.1016/j.infsof.2011.03.004},
  abstract = {Context: Class cohesion is an important object-oriented quality attribute. It refers to the degree of relatedness between the methods and attributes of a class. Several metrics have been proposed to measure the extent to which the class members are related. Most of these metrics have undefined values for a relatively high percentage of classes, which limits their applicability. The classes that have undefined values lack methods, attributes, or parameter types, or they include only a single method. Objective: We improve the applicability of the class cohesion metrics by defining their values for such special classes.
In addition, we theoretically and empirically validate the improved metrics. Method: We theoretically examine whether the defined values satisfy the key cohesion properties. In addition, we empirically validate the metrics before and after the improvements to test whether the defined values improve the ability of the metrics to evaluate class cohesion. We also explore the correlation between the metrics and the presence of faulty classes to indirectly determine the strength or weakness of the metrics in indicating class quality. Results: The results show that our assigned values for the undefined cases do not violate the key cohesion properties and considerably improve the ability of the metrics to explain the presence of faulty classes and may therefore improve their ability to indicate the quality of the class design. Conclusions: Having the class cohesion metrics defined for all possible cases improves the applicability of the metrics and potentially increases their precision in indicating class quality.},
  number = {9},
  journal = {Information and Software Technology},
  author = {Al Dallal, Jehad},
  month = sep,
  year = {2011},
  keywords = {Fault prediction, Metric applicability, Object-oriented class cohesion, Object-oriented software quality},
  pages = {914--928}
}

@misc{langpop.com_programming_2011,
  author = {{LangPop.com}},
  title = {Programming Language Popularity},
  howpublished = {http://www.langpop.com/},
  url = {http://www.langpop.com/},
  month = feb,
  year = {2011},
  note = {Accessed February 2011},
  keywords = {{OOP}, survey}
}

@inproceedings{mitchell_comparing_2001-1,
  address = {Florence, Italy},
  title = {Comparing the decompositions produced by software clustering algorithms using similarity measurements},
  isbn = {0769511899},
  abstract = {Decomposing source code components and relations into subsystem clusters is an active area of research.
Numerous clustering approaches have been proposed in the reverse engineering literature, each one using a different algorithm to identify subsystems. Since different clustering techniques may not produce identical results when applied to the same system, mechanisms that can measure the extent of these differences are needed. Some work to measure the similarity between decompositions has been done, but this work considers the assignment of source code components to clusters as the only criterion for similarity. We argue that better similarity measurements can be designed if the relations between the components are considered. The authors propose two similarity measurements that overcome certain problems in existing measurements. We also provide some suggestions on how to identify and deal with source code components that tend to contribute to poor similarity results. We conclude by presenting experimental results, and by highlighting some of the benefits of our similarity measurements},
  booktitle = {Proceedings. {IEEE} International Conference on Software Maintenance, 2001.},
  author = {Mitchell, B. S. and Mancoridis, S.},
  year = {2001},
  pages = {744--753},
  annote = {Addresses the problem of evaluating the results of clustering. Includes discussion of {MOJO} and precision/recall.}
}

@misc{fernandez_sensitive_2006,
  type = {Article},
  title = {A Sensitive Metric of Class Cohesion},
  url = {http://sci-gems.math.bas.bg:8080/jspui/handle/10525/730},
  abstract = {Metrics estimate the quality of different aspects of software. In particular, cohesion indicates how well the parts of a system hold together. A metric to evaluate class cohesion is important in object-oriented programming because it gives an indication of a good design of classes. There are several proposals of metrics for class cohesion but they have several problems (for instance, low discrimination).
In this paper, a new metric to evaluate class cohesion is proposed, called {SCOM}, which has several relevant features. It has an intuitive and analytical formulation, what is necessary to apply it to large-size software systems. It is normalized to produce values in the range [0..1], thus yielding meaningful values. It is also more sensitive than those previously reported in the literature. The attributes and methods used to evaluate {SCOM} are unambiguously stated. {SCOM} has an analytical threshold, which is a very useful but rare feature in software metrics. We assess the metric with several sample cases, showing that it gives more sensitive values than other well know cohesion metrics.},
  author = {Fern{\'a}ndez, Luis and Pe{\~n}a, Rosal{\'\i}a},
  year = {2006},
  note = {Accessed 2011-11-21},
  keywords = {metrics, object-oriented programming},
  howpublished = {http://sci-gems.math.bas.bg:8080/jspui/handle/10525/730}
}

@inproceedings{barkmann_quantitative_2009,
  address = {Bedford, {UK}},
  title = {Quantitative evaluation of software quality metrics in open-source projects},
  abstract = {The validation of software quality metrics lacks statistical significance. One reason for this is that the data collection requires quite some effort. To help solve this problem, we develop tools for metrics analysis of a large number of software projects (146 projects with ca. 70.000 classes and interfaces and over 11 million lines of code). Moreover, validation of software quality metrics should focus on relevant metrics, i.e., correlated metrics need not to be validated independently. Based on our statistical basis, we identify correlation between several metrics from well-known object-oriented metrics suites. Besides, we present early results of typical metrics values and possible thresholds.},
  booktitle = {Proceedings of The 2009 {IEEE} International Workshop on Quantitative Evaluation of Large-scale Systems and Technologies {(QuEST09)}},
  author = {Barkmann, H. and Lincke, R.
and L{\"o}we, W.},
  month = may,
  year = {2009},
  keywords = {empirical, metrics, metrics validation, size}
}

@inproceedings{kurubus_novel_2008,
  author = {Kurubus, O. and Duru, N.},
  title = {A novel approach about cohesion measurement for classes},
  booktitle = {23rd International Symposium on Computer and Information Sciences},
  address = {Istanbul, Turkey},
  month = oct,
  year = {2008},
  pages = {1--6},
  keywords = {cohesion, metrics},
  abstract = {Cohesion refers to the degree of the relationships among the members in a class. A class is cohesive when its members are highly correlated. Several metrics have been proposed in the literature in order to capture class cohesion in terms of connections among members. They generally count the number of attributes used by methods or the number of methods pairs that share attributes. They constitute a restrictive way for capturing the cohesion. Because they do not consider some characteristics of classes like that special methods, disjoint interaction patterns and connectivity among class members. In this study, a new criterion, which focuses on interactions and groups between class members with considering density of connections among members and incorporates the special methods to cohesion capturing process, is presented, and a new notion about determination of class cohesion is proposed.},
  annote = {Poorly written. Not much new. Relevance: 3}
}

@inproceedings{tarr_n_1999,
  title = {N degrees of separation: Multi-dimensional separation of concerns},
  abstract = {Done well, separation of concerns can provide many software engineering benefits, including reduced complexity, improved reusability, and simpler evolution. The choice of boundaries for separate concerns depends on both requirements on the system and on the kind(s) of decomposition and composition a given formalism supports.
The predominant methodologies and formalisms available, however, support only orthogonal separations of concerns, along single dimensions of composition and decomposition. These characteristics lead to a number of well-known and difficult problems. This paper describes a new paradigm for modeling and implementing software artifacts, one that permits separation of overlapping concerns along multiple dimensions of composition and decomposition. This approach addresses numerous problems throughout the software lifecycle in achieving well-engineered, evolvable, flexible software artifacts and traceability across artifacts.},
  booktitle = {Proceedings of the 21st international conference on Software engineering},
  publisher = {{ACM}},
  address = {New York, {NY}, {USA}},
  author = {Tarr, P. and Ossher, H. and Harrison, W. and Sutton, Jr., S. M.},
  year = {1999},
  keywords = {aspects, graphs, slicing},
  pages = {107--119}
}

@inproceedings{maekelae_external_2007,
  address = {Bulgaria},
  title = {External views on class cohesion},
  isbn = {978-954-9641-50-9},
  url = {http://portal.acm.org/citation.cfm?id=1330598.1330707},
  doi = {10.1145/1330598.1330707},
  abstract = {Several cohesion metrics measuring quality of object-oriented programs have been proposed recently. Typically some kind of bipartite usage graph is calculated between methods of a class and its variables, and interpretations of what constitutes methods, variables, usage relation and calculation method have served as sources of variation. By advancing the usage of instance variables by the instance methods to measure relatedness of the class properties, the values given by metrics depend on implementation choices -- how the contents of an object is presented as instance variable values.
Another problem is that objects often consist of property sets that are only slightly related internally, but clients of objects make such connections between the property sets by advancing internally seemingly unrelated property sets simultaneously.},
  booktitle = {Proceedings of the 2007 International Conference on Computer Systems and Technologies},
  publisher = {{ACM}},
  author = {M{\"a}kel{\"a}, Sami and Lepp{\"a}nen, Ville},
  year = {2007},
  keywords = {cohesion, metrics},
  pages = {1--6}
}

@inproceedings{marticorena_extending_2006,
  title = {Extending a Taxonomy of Bad Code Smells with Metrics},
  url = {http://www.giro.infor.uva.es/Publications/2006/MLC06/},
  abstract = {Bad Smells define in an informal way code flaws, in order to suggest refactorings, their aim is to improve the design of the code. Current taxonomies group code smells, making use of similarity or correlation criteria between them, and leading to a manual revision of the code. By other side, it is suggested the assistance of using metrics in the detection of bad smells. Metrics can be collected automatically helping to suggest the presence of flaws. Nevertheless, current taxonomies do not link these concepts. This work tries to establish additional criteria when we want to classify bad smells. These criteria are also related to metric features.
Following the current classifications, we propose a method to evaluate the suitability of the tools assisting bad code smell detection, as well as selection and implementation of metrics linked with bad code smells.},
  booktitle = {7th {ECOOP} International Workshop on {Object-Oriented} Reengineering},
  author = {Marticorena, Raul and L{\'o}pez, Carlos and Crespo, Yania},
  year = {2006},
  keywords = {metrics, refactoring, smells},
  annote = {The authors discuss the connection between metrics and code smells by adding classifiers to both, and seeing where the classifiers agree in order to determine which metrics are related to which code smells.}
}

@inproceedings{tempero_how_2009,
  address = {Los Alamitos, {CA}, {USA}},
  title = {How Fields are Used in {Java}: An Empirical Study},
  shorttitle = {How Fields are Used in {Java}},
  doi = {10.1109/ASWEC.2009.19},
  abstract = {The information hiding principle is generally accepted as one that if followed leads to higher quality software than if it is not followed. To follow the information hiding principle in object-oriented designs the advice is to avoid non-private fields. There is, however, little empirical evidence as to whether or not this advice is being followed. This paper presents the results of an empirical study of 100 open-source Java applications to determine to what degree non-private fields are declared, and to what extend they are used.
The study indicates that it is not uncommon (albeit not that terribly common) to declare non-private fields, but then not take advantage of that access.},
  booktitle = {Australian Software Engineering Conference},
  publisher = {{IEEE} Computer Society},
  author = {Tempero, Ewan},
  year = {2009},
  keywords = {code analysis, empirical, information hiding},
  pages = {91--100}
}

@inproceedings{balazinska_measuring_1999,
  author = {Balazinska, Magdalena and Merlo, Ettore and Dagenais, Michel and Lague, Bruno and Kontogiannis, Kostas},
  title = {Measuring Clone Based Reengineering Opportunities},
  booktitle = {{METRICS} '99: Proceedings of the 6th International Symposium on Software Metrics},
  publisher = {{IEEE} Computer Society},
  address = {Washington, {DC}, {USA}},
  year = {1999},
  pages = {292--303},
  isbn = {0-7695-0403-5},
  keywords = {clones, metrics},
  annote = {Proposes a classification system for clones based on the kind and quantity of differences. Discusses an algorithm to compute similarity between methods and to classify the relationships between clones. They run the algorithm on six Java projects, provide the data in a table, and discuss the results. Relevance: 5}
}

@article{badri_proposal_2004,
  title = {A Proposal of a New Class Cohesion Criterion: an Empirical Study},
  volume = {3},
  shorttitle = {A Proposal of a New Class Cohesion Criterion},
  abstract = {Class cohesion refers to the degree of the relatedness of the members in a class. It is considered as one of most important object-oriented software attributes. Several metrics have been proposed in the literature in order to measure class cohesion in object-oriented systems. They capture class cohesion in terms of connections among members within a class. The major existing class cohesion metrics are essentially based on instance variables usage criteria. It is only a special and a restricted way of capturing class cohesion.
We believe, as stated in many papers, that class cohesion should not exclusively be based on common instance variables usage criteria. We introduce, in this paper, a new criterion, which focuses on interactions between class methods. We developed a cohesion measurement tool for Java programs and performed a case study on several systems. The obtained results demonstrate that our new class cohesion metric, based on the proposed cohesion criteria, captures several pairs of related methods, which are not captured by the existing cohesion metrics.},
  number = {4},
  journal = {Journal of Object Technology},
  author = {Badri, Linda and Badri, Mourad},
  year = {2004},
  keywords = {cohesion, empirical},
  pages = {145--159},
  url = {http://www.jot.fm/issues/issue_2004_04/article8/},
  annote = {Introduces the idea of cohesion not depending on attributes - methods can be interrelated.}
}

@inproceedings{trifu_towards_2007,
  title = {Towards Automated Restructuring of Object Oriented Systems},
  isbn = {0-7695-2802-3},
  url = {http://portal.acm.org/citation.cfm?id=1252822},
  abstract = {Software aging is an important cost contributor to the maintenance of aging software systems. Recent years have brought significant progress in the area of automatic detection of "code smells" as well as tool support for refactoring and implementing design patterns in the code. Nonetheless, there is hardly any tool support to help the maintainer decide how to refactor in a given situation, such that the recommended refactorings are also meaningful in that particular situation. Most of the existing techniques are either merely supporting the process, such as visualizations, or cannot guarantee meaningful refactorings, such as optimization based techniques. This paper introduces and experimentally evaluates a novel, tool supported approach to determine meaningful refactorings to structural flaws in object oriented systems.
The refactorings recommended by our approach are guaranteed to lead to a meaningful and more maintainable structure in each analyzed situation. The approach contributes to a dramatic reduction of costs, by reducing the need and scope of detailed, manual code analysis.},
  booktitle = {Proceedings of the 11th European Conference on Software Maintenance and Reengineering},
  publisher = {{IEEE} Computer Society},
  author = {Trifu, Adrian and Reupke, Urs},
  year = {2007},
  keywords = {refactoring, smells},
  pages = {39--48}
}

@inproceedings{akers_program_2004,
  address = {Vancouver, {BC}, Canada},
  title = {Program transformations for re-engineering {C++} components {[OOPSLA/GPCE]}},
  isbn = {1-58113-833-4},
  url = {http://portal.acm.org/citation.cfm?id=1028664.1028679},
  doi = {10.1145/1028664.1028679},
  abstract = {Component-based software engineering enables applications to be assembled from component parts that adhere to a component-style specific interface specification and protocol. Components available for one style are not available for another. Component styles evolve, too, which can obsolete components using a legacy style. This creates a demand for migrating components from one style to another, which can require complex changes to the component source code. For a large component library, doing this manually is likely prohibitive. An alternative is to apply automated program transformations to carry out the changes.},
  booktitle = {Companion to the 19th annual {ACM} {SIGPLAN} conference on Object-oriented programming systems, languages, and applications},
  publisher = {{ACM}},
  author = {Akers, Robert L. and Baxter, Ira D.
and Mehlich, Michael},
  year = {2004},
  keywords = {abstract syntax trees, component architectures, rewrite rules, transformation},
  pages = {25--26}
}

@inproceedings{mishne_source_2004,
  author = {Mishne, Gilad and De Rijke, Maarten},
  title = {Source code retrieval using conceptual similarity},
  booktitle = {Proc. 2004 Conf. Computer Assisted Information Retrieval {(RIAO} {\textquoteright}04)},
  year = {2004},
  pages = {539--554},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.7753},
  keywords = {graphs, kbs, ontologies, pattern matching, similarity},
  abstract = {We propose a method for retrieving segments of source code from a large repository. The method is based on conceptual modeling of the code, combining information extracted from the structure of the code and standard information-distance measures. Our results show an improvement over traditional retrieval models, indicating that, for this type of highly-structured documents, usage of structure is indeed beneficial for retrieval.},
  annote = {Talks about a conceptual graph matching algorithm for source code conceptual graphs. Relevance: 5}
}

@article{brandes_faster_2001,
  title = {A faster algorithm for betweenness centrality},
  volume = {25},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.11.2024},
  abstract = {The betweenness centrality index is essential in the analysis of social networks, but costly to compute. Currently, the fastest known algorithms require $\Theta(n^3)$ time and $\Theta(n^2)$ space, where $n$ is the number of actors in the network. Motivated by the fast-growing need to compute centrality indices on large, yet very sparse, networks, new algorithms for betweenness are introduced in this paper. They require $O(n + m)$ space and run in $O(nm)$ and $O(nm + n^2 \log n)$ time on unweighted and weighted networks, respectively, where $m$ is the number of links.
Experimental evidence is provided that this substantially increases the range of networks for which centrality analysis is feasible.},
  number = {2},
  journal = {Journal of Mathematical Sociology},
  author = {Brandes, Ulrik},
  year = {2001},
  keywords = {betweenness, graphs, {SNA}},
  pages = {163--177},
  annote = {This is the algorithm referenced by {JUNG's} betweenness calculations.}
}

@inproceedings{wen_effectiveness_2004,
  author = {Wen, Zhihua and Tzerpos, Vassilios},
  title = {An Effectiveness Measure for Software Clustering Algorithms},
  booktitle = {Proceedings of the 12th {IEEE} International Workshop on Program Comprehension},
  publisher = {{IEEE} Computer Society},
  year = {2004},
  pages = {194--203},
  isbn = {0-7695-2149-5},
  doi = {10.1109/WPC.2004.1311061},
  url = {http://portal.acm.org/citation.cfm?id=1006833},
  keywords = {cluster evaluation, clustering, metrics, software clustering},
  abstract = {Selecting an appropriate software clustering algorithm that can help the process of understanding a large software system is a challenging issue. The effectiveness of a particular algorithm may be influenced by a number of different factors, such as the types of decompositions produced, or the way clusters are named. In this paper, we introduce an effectiveness measure for software clustering algorithms based on {MoJo} distance, and describe an algorithm that calculates its value. We also present experiments that demonstrate its improved performance over previous measures, and show how it can be used to assess the effectiveness of software clustering algorithms.},
  annote = {Discusses a metric for computing the quality of a clustering when compared to a "gold standard." They don't seem to mention what constitutes a "good" number. They are mostly concerned with comparing their algorithm to an earlier one.
Relevance: 4}
}

@article{newman_structure_2003,
  author = {Newman, M. E. J.},
  title = {The structure and function of complex networks},
  journal = {{SIAM} Review},
  volume = {45},
  number = {2},
  pages = {167--256},
  year = {2003},
  doi = {10.1137/S003614450342480},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.1737},
  keywords = {graph algorithms, graphs, {SNA}},
  abstract = {Inspired by empirical studies of networked systems such as the Internet, social networks, and biological networks, researchers have in recent years developed a variety of techniques and models to help us understand or predict the behavior of these systems. Here we review developments in this field, including such concepts as the small-world effect, degree distributions, clustering, network correlations, random graph models, models of network growth and preferential attachment, and dynamical processes taking place on networks.}
}

@article{zaidman_automatic_2008,
  title = {Automatic identification of key classes in a software system using webmining techniques},
  volume = {20},
  url = {http://portal.acm.org/citation.cfm?id=1464380},
  abstract = {Software engineers new to a project are often stuck sorting through hundreds of classes in order to find those few classes that offer a significant insight into the inner workings of the software project. To help stimulate this process, we propose a technique that can identify the most important classes in a system or the key classes of that system. Software engineers can use these classes to focus their understanding efforts when starting to work on a new software project. Those key classes are typically characterized with having a lot of 'control' within the application. In order to find these controlling classes, we present a detection approach that is based on dynamic coupling and webmining. We demonstrate the potential of our technique using two open-source software systems that have a rich documentation set.
During the case studies we use dynamically gathered coupling information that vary between a number of coupling metrics. The case studies show that we are able to retrieve 90\% of the classes deemed important by the original maintainers of the systems, while maintaining a level of precision of around 50\%. Copyright {\textcopyright} 2008 John Wiley \& Sons, Ltd.},
  number = {6},
  journal = {Journal of Software Maintenance and Evolution: Research and Practice},
  author = {Zaidman, Andy and Demeyer, Serge},
  year = {2008},
  keywords = {dynamic analysis, program comprehension, reverse engineering},
  pages = {387--417}
}

@inproceedings{goener_interface_2004,
  author = {G{\"o}{\ss}ner, Jens and Mayer, Philip and Steimann, Friedrich},
  title = {Interface utilization in the {Java} {Development} {Kit}},
  booktitle = {Proceedings of the 2004 {ACM} symposium on Applied computing},
  series = {{SAC} '04},
  publisher = {{ACM}},
  address = {New York, {NY}, {USA}},
  location = {Nicosia, Cyprus},
  year = {2004},
  pages = {1310--1315},
  isbn = {1-58113-812-1},
  doi = {10.1145/967900.968165},
  note = {{ACM} {ID:} 968165},
  keywords = {design, empirical, frameworks, interfaces},
  abstract = {Interfaces as defined in the Java programming language can enhance both decoupling and comprehensibility of large code bases. Several researchers have pointed out this key role of interfaces in object-oriented programming, but so far only little insight as to how interfaces are actually used in practice has been made available. We fill this gap by applying a special metrics suite to one of the most popular pieces of software, the {JAVA} {DEVELOPMENT} {KIT}, and present interesting results.}
}

@book{sommerville_software_1996,
  address = {Wokingham, England},
  edition = {Fifth},
  series = {International computer science series},
  title = {Software Engineering},
  isbn = {0201427656},
  lccn = {{QA76.758} {.S657} 1996},
  publisher = {{Addison-Wesley} Pub.
Co}, author = {Sommerville, Ian}, year = {1996}, keywords = {maintenance} }, @inproceedings{gabel_scalable_2008, address = {Leipzig, Germany}, title = {Scalable detection of semantic clones}, isbn = {978-1-60558-079-1}, url = {http://portal.acm.org/citation.cfm?id=1368132}, doi = {10.1145/1368088.1368132}, abstract = {Several techniques have been developed for identifying similar code fragments in programs. These similar fragments, referred to as code clones, can be used to identify redundant code, locate bugs, or gain insight into program design. Existing scalable approaches to clone detection are limited to finding program fragments that are similar only in their contiguous syntax. Other, semantics-based approaches are more resilient to differences in syntax, such as reordered statements, related statements interleaved with other unrelated statements, or the use of semantically equivalent control structures. However, none of these techniques have scaled to real world code bases. These approaches capture semantic information from Program Dependence Graphs {(PDGs)}, program representations that encode data and control dependencies between statements and predicates. Our definition of a code clone is also based on this representation: we consider program fragments with isomorphic {PDGs} to be clones.}, booktitle = {Proceedings of the 30th international conference on Software engineering}, publisher = {{ACM}}, author = {Gabel, Mark and Jiang, Lingxiao and Su, Zhendong}, year = {2008}, keywords = {clone detection, maintenance, program dependence graph, refactoring}, pages = {321--330} }, @article{harrison_evaluation_1998, title = {An Evaluation of the {MOOD} Set of {Object-Oriented} Software Metrics}, journal = {{IEEE} Transactions on Software Engineering}, volume = {24}, number = {6}, pages = {491--496}, doi = {10.1109/32.689404}, url = {http://www2.computer.org/portal/web/csdl/doi/10.1109/32.689404}, abstract = {This paper describes the results of an investigation into a set of metrics for object-oriented design, called the {MOOD} metrics. 
The merits of each of the six {MOOD} metrics is discussed from a measurement theory viewpoint, taking into account the recognized object-oriented features which they were intended to measure: encapsulation, inheritance, coupling, and polymorphism. Empirical data, collected from three different application domains, is then analyzed using the {MOOD} metrics, to support this theoretical validation. Results show that (with appropriate changes to remove existing problematic discontinuities) the metrics could be used to provide an overall assessment of a software system, which may be helpful to managers of software development projects. However, further empirical studies are needed before these results can be generalized.}, author = {Harrison, Rachel and Counsell, Steve and Nithi, Reuben}, month = jun, year = {1998}, keywords = {empirical, metrics}, howpublished = {http://www2.computer.org/portal/web/csdl/doi/10.1109/32.689404} }, @inproceedings{kataoka_quantitative_2002, title = {A Quantitative Evaluation of Maintainability Enhancement by Refactoring}, booktitle = {Proceedings of the International Conference on Software Maintenance {(ICSM)}}, doi = {10.1109/ICSM.2002.1167822}, url = {http://www2.computer.org/portal/web/csdl/doi/10.1109/ICSM.2002.1167822}, abstract = {Program refactoring is a technique to enhance the maintainability of a program. Although the concept itself is considered to be effective, there are few quantitative evaluation of its impact to the software maintainability. It is sometimes difficult to judge whether the refactoring in question should be applied or not without knowing the effect accurately. We propose a quantitative evaluation method to measure the maintainability enhancement effect of program refactoring. We focused on the coupling metrics to evaluate the refactoring effect. By comparing the coupling before and after the refactoring, we could evaluate the degree of maintainability enhancement. 
We applied our method to a certain program and showed that our method was really effective to quantify the refactoring effect and helped us to choose appropriate refactorings.}, author = {Kataoka, Y. and Imai, T. and Andou, H. and Fukaya, T.}, month = oct, year = {2002}, keywords = {coupling, maintainability, maintenance, metrics, refactoring} }, @inproceedings{olbrich_are_2010, title = {Are all code smells harmful? {A} study of {God Classes} and {Brain Classes} in the evolution of three open source systems}, abstract = {Code smells are particular patterns in object-oriented systems that are perceived to lead to difficulties in the maintenance of such systems. It is held that to improve maintainability, code smells should be eliminated by refactoring. It is claimed that classes that are involved in certain code smells are liable to be changed more frequently and have more defects than other classes in the code. We investigated the extent to which this claim is true for God Classes and Brain Classes, with and without normalizing the effects with respect to the class size. We analyzed historical data from 7 to 10 years of the development of three open-source software systems. The results show that God and Brain Classes were changed more frequently and contained more defects than other kinds of class. However, when we normalized the measured effects with respect to size, then God and Brain Classes were less subject to change and had fewer defects than other classes. Hence, under the assumption that God and Brain Classes contain on average as much functionality per line of code as other classes, the presence of God and Brain Classes is not necessarily harmful; in fact, such classes may be an efficient way of organizing code.}, booktitle = {2010 {IEEE} International Conference on Software Maintenance {(ICSM)}}, author = {Olbrich, S. M. and Cruzes, D. S. and Sj{\o}berg, D. I. 
K.}, year = {2010}, keywords = {maintainability, maintenance, size, smells}, pages = {1--10} }, @article{bieman_measuring_1998, title = {Measuring {Design-Level} Cohesion}, volume = {24}, number = {2}, issn = {0098-5589}, url = {http://dx.doi.org/10.1109/32.666825}, doi = {10.1109/32.666825}, abstract = {Cohesion was first introduced as a software attribute that, when measured, could be used to predict properties of implementations that would be created from a given design. Unfortunately, cohesion, as originally defined, could not be objectively assessed, while more recently developed objective cohesion measures depend on code-level information. We show that association-based and slice-based approaches can be used to measure cohesion using only design-level information. An analytical and empirical analysis shows that the design-level measures correspond closely with code-level cohesion measures. They can be used as predictors of or surrogates for the code-level measures. The design-level cohesion measures are formally defined, have been implemented, and can support software design, maintenance, and restructuring.}, journal = {{IEEE} Transactions on Software Engineering}, author = {Bieman, James M. and Kang, Byung-Kyoo}, month = feb, year = {1998}, note = {{ACM} {ID:} 631219}, keywords = {algorithms, cohesion, complexity measures, design, metrics}, pages = {111--124} }, @book{gomez-perez_ontological_2004, address = {London}, series = {Advanced information and knowledge processing}, title = {Ontological Engineering: With Examples from the Areas of Knowledge Management, {E-Commerce} and the Semantic Web}, isbn = {1852335513}, lccn = {{QA76.76.E95} G65 2004}, shorttitle = {Ontological Engineering}, publisher = {Springer}, author = {G{\'o}mez-P{\'e}rez, Asunci{\'o}n and Fern{\'a}ndez-L{\'o}pez, Mariano and Corcho, Oscar}, year = {2004}, keywords = {kbs, semantic web}, annote = {This is a modern book that explains the state of ontological research. 
It does a good job of finding a middle ground among the different views of various researchers. It covers traditional frame systems as well as semantic web approaches and discusses the role of formal logic. It also covers various representation languages, methodologies, and tools. Relevance: 5} }, @book{watson_java_2004, title = {Java Programming 10-minute Solutions}, isbn = {0782142850, 9780782142853}, publisher = {Sybex}, author = {Watson, Mark}, year = {2004}, annote = {Page 290 says a {POJO} "typically just defines private data and public {GET/SET} methods." This discussion is in the context of Hibernate and Java Domain Objects {(JDOs).} Relevance: 2} }, @inproceedings{yang_measuring_2007, address = {Melbourne, Australia}, title = {Measuring the Strength of Indirect Coupling}, issn = {1530-0803}, abstract = {It is widely accepted that coupling plays an important role in software quality, particularly in the areas of software maintenance, so effort should be made to keep coupling levels to a minimum in order to reduce the complexity of the system. We have previously introduced the concept of "indirect" coupling - coupling formed by relationships/dependencies that are not directly evident - with the belief that high levels of indirect coupling can constitute greater costs to maintenance as it is harder to detect. In this paper we extend our previous studies by proposing metrics that can advance our understanding of the exact relationship between indirect coupling and maintainability. In particulars the metrics focus on the reflection of "strength" as it is a fundamental component of coupling. 
We present our observations on the results of applying the metrics to existing Java applications.}, booktitle = {Proceedings of 2007 Australian Software Engineering Conference {(ASWEC'07)}}, author = {Yang, Hong Yul and Tempero, Ewan}, year = {2007}, keywords = {coupling, indirect coupling, maintenance, metrics}, pages = {319--328}, annote = {Has a good discussion about validating results, including pointers to metric properties E. J. Weyuker. Evaluating software complexity measures. {IEEE} Transactions on Software Engineering, 14(9):1357{\textendash}1365, September 1988. and critiques N. Fenton. Software measurement: A necessary scientific basis. {IEEE} Transactions on Software Engineering, 20(3):199{\textendash}206, March 1994.} }, @inproceedings{antoniol_identifying_2010, address = {Timisoara, Romania}, title = {Identifying Extract Class Opportunities through Game Theory}, abstract = {In software engineering, developers must often find solutions to problems while balancing competing goals, e.g., quality versus cost, time to market versus resources or cohesion versus coupling. Finding suitable compromises between contrasting goals is often complex and recommendation systems are useful to support developers and managers in performing such a complex task. We believe contrasting goals can be often dealt with game theory techniques. Indeed, game theory is successfully used in other fields, especially in economics, to mathematically propose solutions to strategic situation, in which an individual's success in making choices depends on the choices of others. To demonstrate the applicability of game theory to software engineering and to understand its pros and cons, we propose an approach based on game theory that recommend extract-class refactoring opportunities. 
A preliminary evaluation inspired by mutation testing demonstrates the applicability and the benefits of the proposed approach.}, booktitle = {Proceedings of the 26th {IEEE} International Conference on Software Maintenance}, author = {Antoniol, G. and Bavota, G. and De Lucia, A. and Gu{\'e}h{\'e}neuc, Y.-G. and Oliveto, R.}, year = {2010}, keywords = {extract class, refactoring} }, @article{briand_unified_1999, title = {A unified framework for coupling measurement in object-oriented systems}, volume = {25}, url = {http://portal.acm.org/citation.cfm?id=297728}, abstract = {The increasing importance being placed on software measurement has led to an increased amount of research developing new software measures. Given the importance of object-oriented development techniques, one specific area where this has occurred is coupling measurement in object-oriented systems. However, despite a very interesting and rich body of work, there is little understanding of the motivation and empirical hypotheses behind many of these new measures. It is often difficult to determine how such measures relate to one another and for which application they can be used. As a consequence, it is very difficult for practitioners and researchers to obtain a clear picture of the state-of-the-art in order to select or define measures for object-oriented systems. This situation is addressed and clarified through several different activities. First, a standardized terminology and formalism for expressing measures is provided which ensures that all measures using it are expressed in a fully consistent and operational manner. Second, to provide a structured synthesis, a review of the existing frameworks and measures for coupling measurement in object-oriented systems takes place. 
Third, a unified framework, based on the issues discovered in the review, is provided and all existing measures are then classified according to this framework. This paper contributes to an increased understanding of the state-of-the-art: A mechanism is provided for comparing measures and their potential use, integrating existing measures which examine the same concepts in different ways, and facilitating more rigorous decision making regarding the definition of new measures and the selection of existing measures for a specific goal of measurement. In addition, our review of the state-of-the-art highlights that many measures are not defined in a fully operational form, and relatively few of them are based on explicit empirical models, as recommended by measurement theory.}, number = {1}, journal = {{IEEE} Transactions on Software Engineering}, author = {Briand, Lionel C. and Daly, John W. and W{\"u}st, J{\"u}rgen K.}, year = {1999}, keywords = {coupling, metrics, survey}, pages = {91--121}, annote = {A great overview and analysis of {OO} coupling. Dated but still relevant. Relevance: 5} }, @techreport{emam_confounding_1999, title = {The Confounding Effect of Class Size on the Validity of Object-oriented Metrics}, url = {http://iit-iti.nrc-cnrc.gc.ca/publications/nrc-43606_e.html}, abstract = {Much effort has been devoted to the development and empirical validation of object-oriented metrics. The empirical validations performed thus far would suggest that a core set of validated metrics is close to being identified. However, none of these studies control for the potentially confounding effect of class size. In this paper we show a strong size confounding effect, and question the results of previous object-oriented validation studies. 
We first investigated whether there is a confounding effect of class size in validation studies of object-oriented metrics and show that based on previous work there is reason to believe that such an effect exists. We then describe a detailed empirical methodology for identifying those effects. Finally, we perform a study on a large C++ telecommunications framework to examine if size is really a confounder. This study considered the Chidamber and Kemerer metrics, and a subset of the Lorenz and Kidd metrics. The dependent variable was the incidence of a fault attributable to a field failure (fault-proneness of a class). Our findings indicate that before controlling for size, the results are very similar to previous studies: the metrics that are expected to be validated are indeed associated with fault-proneness. After controlling for size none of the metrics we studied were associated with fault-proneness anymore. This demonstrates a strong size confounding effect, and casts doubt on the results of previous object-oriented metrics validation studies. It is recommended that previous validation studies be re-examined to determine whether their conclusions would still hold after controlling for size, and that future validation studies should always control for size.}, number = {{NRC} 43606}, institution = {National Research Council Canada}, author = {El Emam, Khaled and Benlarbi, Sa{\"\i}da and Goel, Nishith}, year = {1999}, keywords = {coupling, metrics, size}, pages = {38} }, @article{okeeffe_search-based_2008, title = {Search-based refactoring for software maintenance}, volume = {81}, issn = {0164-1212}, url = {http://www.sciencedirect.com/science/article/B6V0N-4NYJ0M6-1/2/7a5e31ebe95593512fdef756fddfa579}, doi = {10.1016/j.jss.2007.06.003}, abstract = {The high cost of software maintenance could be reduced by automatically improving the design of object-oriented programs without altering their behaviour. 
We have constructed a software tool capable of refactoring object-oriented programs to conform more closely to a given design quality model, by formulating the task as a search problem in the space of alternative designs. This novel approach is validated by two case studies, where programs are automatically refactored to increase flexibility, reusability and understandability as defined by a contemporary quality model. Both local and simulated annealing searches were found to be effective in this task.}, number = {4}, journal = {Journal of Systems and Software}, author = {{O'Keeffe}, Mark and {\'O} Cinn{\'e}ide, Mel}, month = apr, year = {2008}, keywords = {refactoring, search}, pages = {502--516} }, @inproceedings{adar_softguess:_2007, address = {Washington, {DC}, {USA}}, title = {{SoftGUESS:} Visualization and Exploration of Code Clones in Context}, isbn = {0-7695-2828-7}, doi = {10.1109/ICSE.2007.76}, booktitle = {{ICSE} '07: Proceedings of the 29th international conference on Software Engineering}, publisher = {{IEEE} Computer Society}, author = {Adar, Eytan and Kim, Miryung}, year = {2007}, keywords = {clones, visualization}, pages = {762--766}, annote = {Discusses the visualization of the evolution of clones over time (new versions). Relevance: 2 } }, @article{mitchell_automatic_2006, title = {On the automatic modularization of software systems using the Bunch tool}, volume = {32}, issn = {0098-5589}, doi = {10.1109/TSE.2006.31}, abstract = {Since modern software systems are large and complex, appropriate abstractions of their structure are needed to make them more understandable and, thus, easier to maintain. Software clustering techniques are useful to support the creation of these abstractions by producing architectural-level views of a system's structure directly from its source code. This paper examines the Bunch clustering system which, unlike other software clustering tools, uses search techniques to perform clustering. 
Bunch produces a subsystem decomposition by partitioning a graph of the entities (e.g., classes) and relations (e.g., function calls) in the source code. Bunch uses a fitness function to evaluate the quality of graph partitions and uses search algorithms to find a satisfactory solution. This paper presents a case study to demonstrate how Bunch can be used to create views of the structure of significant software systems. This paper also outlines research to evaluate the software clustering results produced by Bunch.}, number = {3}, journal = {{IEEE} Transactions on Software Engineering}, author = {Mitchell, B. S. and Mancoridis, S.}, year = {2006}, keywords = {automatic modularization, clustering, graph partition, maintainability, maintenance, program comprehension, reengineering, reverse engineering, search, software clustering}, pages = {193--208} }, @book{gamma_design_1994, address = {Boston}, title = {Design Patterns: Elements of Reusable {Object-Oriented} Software}, isbn = {0201633612}, shorttitle = {Design Patterns}, publisher = {{Addison-Wesley} Professional}, author = {Gamma, Erich and Helm, Richard and Johnson, Ralph and Vlissides, John M.}, month = nov, year = {1994}, keywords = {design patterns, {OOD}} }, @inproceedings{de_moor_.ql:_2007, address = {Braga, Portugal}, title = {{.QL:} {Object-Oriented} Queries Made Easy}, abstract = {These notes are an introduction to {.QL}, an object-oriented query language for any type of structured data. We illustrate the use of {.QL} in assessing software quality, namely to find bugs, to compute metrics and to enforce coding conventions. 
The class mechanism of {.QL} is discussed in depth, and we demonstrate how it can be used to build libraries of reusable queries.}, booktitle = {Generative and Transformational Techniques for Software Engineering}, author = {de Moor, Oege and Sereni, Damien and Verbaere, Mathieu and Hajiyev, Elnar and Avgustinov, Pavel and Ekman, Torbj{\"o}rn and Ongkingco, Neil and Tibble, Julian}, month = jul, year = {2007}, keywords = {metrics, query language}, annote = {In addition to lots of examples of queries and explanations of them, it also reviews related work. This is the language used by {SemmleCode.} Outline: 1 Introduction 2 Program Queries 2.1 A Simple Query 2.2 Methods 2.3 Sets of Results 2.4 Casts 2.5 Chaining 2.6 Aggregates 3 {Object-Oriented} Queries 3.1 Motivating Examples 3.2 Generic Queries 3.3 Inheritance and method dispatch 3.4 Database Schema 3.5 From Primitives to Classes 4 Related Work 4.1 Code Queries 4.2 Object-oriented Query Languages 5 Conclusion Relevance: 5 } }, @incollection{briand_empirical_2002, title = {Empirical studies of quality models in object-oriented systems}, volume = {56}, issn = {0065-2458}, url = {http://www.sciencedirect.com/science/article/B7RNF-4ND0R1J-5/2/9e772645aeffffbb51c73d7a1defd821}, abstract = {Measuring structural design properties of a software system, such as coupling, cohesion, or complexity, is a promising approach toward early quality assessments. To use such measurement effectively, quality models that quantitatively describe how these internal structural properties relate to relevant external system qualities such as reliability or maintainability are needed. This chapter's objective is to summarize, in a structured and detailed fashion, the empirical results reported so far with modeling external system quality based on structural design properties in object-oriented systems. We perform a critical review of existing work in order to identify lessons learned regarding the way these studies are performed and reported. 
Constructive guidelines for facilitating the work of future studies are also provided, thus facilitating the development of an empirical body of knowledge.}, booktitle = {Advances in Computers}, publisher = {Elsevier}, author = {Briand, Lionel C. and W{\"u}st, J{\"u}rgen}, editor = {Zelkowitz, Marvin V.}, year = {2002}, keywords = {empirical, metrics, metrics validation}, pages = {97--166}, annote = {"results show that overall, cohesion measures appear to have no significant relationship to fault-proneness" } }, @article{tsantalis_identification_2010, title = {Identification of refactoring opportunities introducing polymorphism}, volume = {83}, issn = {0164-1212}, url = {http://www.sciencedirect.com/science/article/B6V0N-4X6MSR5-2/2/1cd9c725d91737926009f425afc57f61}, doi = {10.1016/j.jss.2009.09.017}, abstract = {Polymorphism is one of the most important features offered by object-oriented programming languages, since it allows to extend/modify the behavior of a class without altering its source code, in accordance to the {Open/Closed} Principle. However, there is a lack of methods and tools for the identification of places in the code of an existing system that could benefit from the employment of polymorphism. In this paper we propose a technique that extracts refactoring suggestions introducing polymorphism. The approach ensures the behavior preservation of the code and the applicability of the refactoring suggestions based on the examination of a set of preconditions.}, number = {3}, journal = {Journal of Systems and Software}, author = {Tsantalis, Nikolaos and Chatzigeorgiou, Alexander}, month = mar, year = {2010}, keywords = {refactoring}, pages = {391--404} }, @article{watanabe_information_1960, title = {Information theoretical analysis of multivariate correlation}, volume = {4}, abstract = {A set $X$ of stochastic variables, $y_1, y_2, \ldots, y_n$, is grouped into subsets, $p_1, p_2, \ldots, p_k$. 
The correlation existing in $X$ with respect to the $p$'s is adequately expressed by $C = \sum_i S(p_i) - S(X) \geq 0$, where $S(v)$ is the entropy function defined with reference to the variables $y$ in subset $v$. For a given $X$, $C$ becomes maximum when each $p_i$ consists of only one variable, ($n=k$). The value $C$ is then called the total correlation in $X$, $C_{tot}(X)$. The present paper gives various theorems, according to which $C_{tot}(X)$ can be decomposed in terms of the partial correlations existing in subsets of $X$, and of quantities derivable therefrom. The information-theoretical meaning of each decomposition is carefully explained. As illustrations, two problems are discussed at the end of the paper: (1) redundancy in geometrical figures in pattern recognition, and (2) randomization effect of shuffling cards marked {\textquotedblleft}zero{\textquotedblright} or {\textquotedblleft}one.{\textquotedblright}}, number = {1}, journal = {{IBM} Journal of Research and Development}, author = {Watanabe, S.}, year = {1960}, keywords = {cohesion, metrics}, pages = {66--82} }, @article{hatton_does_1998, title = {Does {OO} sync with how we think?}, volume = {15}, issn = {0740-7459}, abstract = {Is object orientation an imperfect paradigm for reliable coding? Worse, does it focus on the wrong part of the life cycle? The author thinks so and explains why. Given that corrective-maintenance costs already dominate the software life cycle and look set to increase significantly, the author argues that reliability in the form of reducing such costs is the most important software improvement goal. Yet, the results are not promising when we review recent corrective-maintenance data for big systems in general and for {OO} systems, in this case written in C++. The author asserts that any paradigm that is capable of decomposing a system into large numbers of small components-as frequently occurs in both {OO} and conventional systems-is fundamentally wrong. 
Thus, because both paradigms suffer from this flaw, we should expect no particular benefits to accrue from an {OO} system over a {non-OO} system. Further, a detailed comparison of {OO} programming and the human thought processes involved in short and long term memory suggests that {OO} aligns with human thinking limitations indifferently at best. In the case studies described, {OO} is no more than a different paradigm, and emphatically not a better one, although it is not possible to apportion blame between the {OO} paradigm itself and its C++ implementation}, number = {3}, journal = {{IEEE} Software}, author = {Hatton, L.}, year = {1998}, keywords = {human factors, maintenance, {OOP}}, pages = {46--54} }, @article{tsantalis_design_2006, title = {Design Pattern Detection Using Similarity Scoring}, volume = {32}, url = {http://portal.acm.org/citation.cfm?id=1248777}, abstract = {The identification of design patterns as part of the reengineering process can convey important information to the designer. However, existing pattern detection methodologies generally have problems in dealing with one or more of the following issues: Identification of modified pattern versions, search space explosion for large systems and extensibility to novel patterns. In this paper, a design pattern detection methodology is proposed that is based on similarity scoring between graph vertices. Due to the nature of the underlying graph algorithm, this approach has the ability to also recognize patterns that are modified from their standard representation. Moreover, the approach exploits the fact that patterns reside in one or more inheritance hierarchies, reducing the size of the graphs to which the algorithm is applied. Finally, the algorithm does not rely on any pattern-specific heuristic, facilitating the extension to novel design structures. 
Evaluation on three open-source projects demonstrated the accuracy and the efficiency of the proposed method.}, number = {11}, journal = {{IEEE} Transactions on Software Engineering}, author = {Tsantalis, Nikolaos and Chatzigeorgiou, Alexander and Stephanides, George and Halkidis, Spyros T.}, year = {2006}, keywords = {{OOD}, patterns, restructuring, reverse engineering}, pages = {896--909} }, @inproceedings{al_dallal_improving_2011, address = {Cambridge, {UK}}, title = {Improving object-oriented lack-of-cohesion metric by excluding special methods}, abstract = {Classes are the basic units in object-oriented programs, and therefore, their quality has impact on the overall quality of the software. Class cohesion is a key quality factor, and it refers to the degree of relatedness of class attributes and methods. Software developers use class cohesion measure to assess the quality of their products and to guide the restructuring of poorly designed classes. Several class cohesion metrics are proposed in the literature, and the impact of considering the special methods (i.e., constructors, destructors, and access methods) in cohesion calculation is not empirically studied for most of them. In this paper, we address this issue for one of the most popular class cohesion metrics, referenced as Lack of Cohesion {(LCOM).} Our empirical study involves applying the metric with and without considering special methods on classes of two open source Java applications and statistically analyzing the results. 
The empirical study results show that the ability of {LCOM} in indicating class quality slightly improves when excluding special methods from the {LCOM} computation.}, booktitle = {Proceedings of the 10th {WSEAS} International Conference on Software Engineering, Parallel and Distributed Systems}, author = {Al Dallal, J.}, year = {2011}, keywords = {cohesion, empirical}, pages = {124--129} }, @article{henderson-sellers_coupling_1996, title = {Coupling and cohesion (towards a valid metrics suite for object-oriented analysis and design)}, volume = {3}, journal = {Object Oriented Systems}, author = {{Henderson-Sellers}, Brian and Constantine, Larry L. and Graham, Ian M.}, year = {1996}, keywords = {cohesion, coupling, metrics}, pages = {143--158} }, @mastersthesis{prinz_graph_2006, type = {Master's thesis}, title = {The Graph Visualization System {(GVS)}: A Flexible Java Framework for Graph Drawing}, abstract = {A graph describes relationships between entities and is usually represented by a set of nodes (entities) and a set of edges (relations) between the nodes. Metadata such as labels or weights are often associated with the elements of a graph. The field of graph drawing, part of the wider field of information visualization, seeks to visualize the abstract information contained within a graph for the human observer. A drawing of a graph is a graphical image, in two or three dimensions, which reflects the graph{\textquoteright}s topology and characteristics as closely as possible. Applications of graph drawing include social network and web site visualization, transportation network maps, and document cluster analysis. The Graph Visualization System {(GVS)} is a modular, flexible, and extensible framework for graph drawing implemented in Java. {GVS} provides implementations of some of the standard layered and force-directed graph drawing techniques. In addition, {GVS} is designed to be used as a demonstrator tool when teaching graph drawing methods. 
To this end, each of the implemented algorithms is divided into its constituent parts, which can be stepped through (and undone) interactively.}, school = {Institute for Information Systems and Computer Media {(IICM)}, Graz University of Technology}, author = {Prinz, Wolfgang}, month = mar, year = {2006}, keywords = {graphs, visualization}, annote = {For my purposes, this is useful primarily for its review of graphing tools and their capabilities. Relevance: 3} }, @book{shavor_javatm_2003, title = {The {Java(TM)} Developer's Guide to Eclipse}, isbn = {0321159640}, publisher = {{Addison-Wesley} Professional}, author = {Shavor, Sherry and {D'Anjou}, Jim and Fairbrother, Scott and Kehn, Dan and Kellerman, John and {McCarthy}, Pat}, month = may, year = {2003}, keywords = {Eclipse} }, @inproceedings{ekman_refactoring_2008, title = {Refactoring is not (yet) about transformation}, abstract = {In order to ensure correctness, refactorings have to check extensive preconditions before performing the transformation. These preconditions usually involve subtle analyses of the program to be refactored, and as long as there is no good support for implementing them, refactoring is not about transformation, but about analysis. In most cases, these refactoring analyses are very similar to analyses implemented in a compiler and require the same level of detail to ensure behaviour preservation. We therefore propose to implement a refactoring engine on top of a compiler to leverage existing infrastructure, and complement it with refactoring-specific functionality. Many simple refactorings appear as building blocks in more complex refactorings. We have implemented two such building blocks that are widely useful: The first one allows to move symbolic names from one place in the program to another while preserving binding structure; it frees the developer from having to worry about issues like name clashes and accidental overriding. 
The second building block encapsulates data flow and control flow analyses, enabling the developer to specify precise conditions for validity of a transformation in terms of concepts like dominance and liveness. Based on these approaches, we have implemented a refactoring engine as part of a larger effort to generate {IDEs} from declarative language specifications using the {JastAdd} metacompiler tools. The described building blocks were successfully used as a foundation for other refactorings such as Rename, Extract Method, and Encapsulate Field.}, booktitle = {Second Workshop on Refactoring Tools {(WRT)}}, author = {Ekman, T. and Sch\"{a}fer, M. and Verbaere, M.}, year = {2008}, keywords = {refactoring, refactoring engines, transformation} }, @inproceedings{heitlager_practical_2007, title = {A Practical Model for Measuring Maintainability}, abstract = {The amount of effort needed to maintain a software system is related to the technical quality of the source code of that system. The {ISO} 9126 model for software product quality recognizes maintainability as one of the 6 main characteristics of software product quality, with adaptability, changeability, stability, and testability as subcharacteristics of maintainability. Remarkably, {ISO} 9126 does not provide a consensual set of measures for estimating maintainability on the basis of a system's source code. On the other hand, the maintainability index has been proposed to calculate a single number that expresses the maintainability of a system. In this paper, we discuss several problems with the {MI}, and we identify a number of requirements to be fulfilled by a maintainability model to be usable in practice. We sketch a new maintainability model that alleviates most of these problems, and we discuss our experiences with using such as system for {IT} management consultancy activities.}, booktitle = {6th International Conference on the Quality of Information and Communications Technology, 2007. 
{QUATIC} 2007.}, author = {Heitlager, I. and Kuipers, T. and Visser, J.}, year = {2007}, keywords = {{ISO} 9126 model, maintainability, maintenance}, pages = {30--39} }, @inproceedings{mcdowell_rlf_1989, title = {The {RLF} Librarian: A Reusability Librarian Based on Cooperating {Knowledge-Based} Systems}, shorttitle = {The {RLF} Librarian}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.7975}, abstract = {Knowledge-based techniques provide tremendous leverage for managing and using repositories of reusable software. The Unisys Reusability Library Framework demonstrates this by providing an intelligent Librarian application that operates on a model of a software domain. The Librarian uses structured inheritance networks to model the library domain and organize the repository contents. The resulting structure of the domain model allows convenient repository browsing and component retrieval and provides a foundation for a variety of powerful tools. Rule-based inferencers complement this structure by making heuristic knowledge and advice available to the user. This makes the domain knowledge and repository organization more accessible to the user. 
The {RLF} uses a hybridization of these two knowledge representation paradigms, taking advantage of the strengths of each to provide intelligent assistance in the use and management of a software repository.}, booktitle = {Proceedings of the 4th Annual Rome Air Development Center {Knowledge-Based} Software Assistant Conference}, author = {{McDowell}, Raymond C and Cassell, Keith A}, month = sep, year = {1989}, keywords = {kbs, reusability, semantic networks} }, @inproceedings{etzkorn_towards_2000, title = {Towards a semantic metrics suite for object-oriented design}, isbn = {0-7695-0774-3}, url = {http://portal.acm.org/citation.cfm?id=833277}, abstract = {In recent years, much work has been performed in developing suites of metrics that are targeted for object-oriented software, rather than functionally oriented software. This is necessary since good object-oriented software has several characteristics, such as inheritance and polymorphism that are not usually present in functionally oriented software. However, all of these object-oriented metrics suites have been defined using only syntactic aspects of object-oriented software; indeed, the earlier functionally-oriented metrics were also calculated using only syntactic information. All syntactically oriented metrics have the problem that the mapping from the metric to the quality the metric purports to measure, such as the software quality factor ``cohesion,'' is indirect, and often arguable. Thus, a substantial amount of research effort goes into proving that these syntactically oriented metrics actually do measure their associated quality factors. This paper introduces a new suite of semantically derived object-oriented metrics, which provide a more direct mapping from the metric to its associated quality factor than is possible using syntactic metrics. 
These semantically derived metrics are calculated using knowledge-based, program understanding, and natural language processing techniques.}, booktitle = {Proceedings of the Technology of {Object-Oriented} Languages and Systems}, publisher = {{IEEE} Computer Society}, author = {Etzkorn, Letha and Delugach, Harry}, year = {2000}, keywords = {conceptual graphs, kbs, metrics, natural language processing, program comprehension, semantic networks, semantics}, pages = {71} }, @article{sutherland_business_1995, title = {Business objects in corporate information systems}, volume = {27}, url = {http://portal.acm.org/citation.cfm?id=210376.210394}, doi = {10.1145/210376.210394}, number = {2}, journal = {{ACM} Computing Surveys}, author = {Sutherland, Jeff}, year = {1995}, keywords = {maintenance}, pages = {274--276}, annote = {Includes dollar numbers about maintenance costs.} }, @inproceedings{simon_distance_1999, address = {Amsterdam, The Netherlands}, title = {Distance Based Cohesion Measuring}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.46.7818}, abstract = {Abstract: The design principle {\textquotedblleft}put together what belongs together{\textquotedblright} is one major principle in software engineering. There are as many criteria that guide this grouping as there are grouping possibilities of the system. Their extraction for reverse engineering should be tool supported. One instrument for their implementation are distance measures. After defining some theoretical concepts and general aspects of cohesion, a distance measurement framework is presented. With it a generic cohesion concept is introduced, that is applicable to different abstraction levels of software. Some typical applications of it for object oriented source code are presented. 
Because distances are well defined there exist many visualisations that are easy to insert into the distance measurement framework and can provide a powerful support during reverse engineering.}, booktitle = {Proceedings of the 2nd European Software Measurement Conference {(FESMA)}}, publisher = {Technologisch Instituut}, author = {Simon, Frank and L{\"o}ffler, Silvio and Lewerentz, Claus}, year = {1999}, keywords = {cohesion, distance metric, metrics, similarity}, pages = {69--83} }, @inproceedings{ma_usage_2006, address = {Los Alamitos, {CA}, {USA}}, title = {Usage Patterns of the Java Standard {API}}, doi = {10.1109/APSEC.2006.60}, abstract = {The Java Standard {API} has grown enormously since Java's beginnings, now consisting of over 3,000 classes and 20,000 methods. The intent of this {API} is to provide high quality components that can be easily reused and so increase the Java developer's productivity but does it? In this paper, we present a study that begins to answer this question. Specifically we take a corpus-based approach to help determine the "typical" usage of the Standard {API.} We find that, in an extensive corpus of open-source software, only about 50\% of the classes in the Standard {API} are used at all, and around 21\% of the methods are used. We discuss the implications this has for future development of both the {API} itself, and for tools to support the {API.}}, booktitle = {{Asia-Pacific} Software Engineering Conference}, publisher = {{IEEE} Computer Society}, author = {Ma, Homan and Amor, Robert and Tempero, Ewan}, year = {2006}, pages = {342--352} }, @inproceedings{mealy_improving_2007, title = {Improving Usability of Software Refactoring Tools}, issn = {1530-0803}, abstract = {Post-deployment maintenance and evolution can account for up to 75\% of the cost of developing a software system. 
Software refactoring can reduce the costs associated with evolution by improving system quality. Although refactoring can yield benefits, the process includes potentially complex, error-prone, tedious and time-consuming tasks. It is these tasks that automated refactoring tools seek to address. However, although the refactoring process is well-defined, current refactoring tools do not support the full process. To develop better automated refactoring support, we have completed a usability study of software refactoring tools. In the study, we analysed the task of software refactoring using the {ISO} 9241-11 usability standard and Fitts' List of task allocation. Expanding on this analysis, we reviewed 11 collections of usability guidelines and combined these into a single list of 38 guidelines. From this list, we developed 81 usability requirements for refactoring tools. Using these requirements, the software refactoring tools Eclipse 3.2, Condenser 1.05, {RefactorIT} 2.5.1, and Eclipse 3.2 with the Simian {UI} 2.2.12 plugin were studied. Based on the analysis, we have selected a subset of the requirements that can be incorporated into a prototype refactoring tool intended to address the full refactoring process.}, booktitle = {Software Engineering Conference, 2007. {ASWEC} 2007. 18th Australian}, author = {Mealy, E. and Carrington, D. and Strooper, P. and Wyeth, P.}, year = {2007}, keywords = {maintenance, refactoring}, pages = {307--318}, annote = {Usability issues and guidelines for refactoring tools. Relevance: 3} }, @article{marcus_using_2008, title = {Using the conceptual cohesion of classes for fault prediction in object-oriented systems}, volume = {34}, abstract = {High cohesion is a desirable property of software, as it positively impacts understanding, reuse, and maintenance. Currently proposed measures for cohesion in {Object-Oriented} {(OO)} software reflect particular interpretations of cohesion and capture different aspects of cohesion. 
The paper proposes a new measure for the cohesion of classes in an {OO} software system, based on the analysis of the unstructured information embedded in the source code, such as comments and identifiers. The measure, named the Conceptual Cohesion of Classes {(C3)}, is inspired from the mechanisms used to measure textual coherence in cognitive psychology and computational linguistics. The paper presents the principles and the technology that stand behind the C3 measure. A large case study on three open source software systems is presented, which compares the new measure with an extensive set of existing metrics and uses them to construct models that predict software faults. The case study shows that the novel measure captures different aspects of class cohesion compared to any of the existing cohesion measures. In addition, combining C3 with existing structural cohesion metrics proves to be a better predictor of faulty classes when compared to different combinations of structural cohesion metrics.}, number = {2}, journal = {{IEEE} Trans. Softw. Eng.}, author = {Marcus, Andrian and Poshyvanyk, Denys and Ferenc, Rudolf}, year = {2008}, keywords = {cohesion, comparative study, maintainability, metrics, restructuring, reverse engineering, semantics}, pages = {287--300}, annote = {Contains a real good summary of the cohesion world. Also does a comparative study of various structural metrics. The "documents" being compared semantically are the methods, including associated comments. the terms include the parsed parts of identifiers. Relevance: 5 } }, @book{ieee_ieee_1990, address = {New York, New York}, title = {{IEEE} Standard Glossary of Software Engineering Terminology}, volume = {121990}, isbn = {{1-55937-067-X}}, abstract = {{IEEE} Std 610.12-1990, {IEEE} Standard Glossary of Software Engineering Terminology, identifies terms currently in use in the field of Software Engineering. 
Standard definitions for those terms are established.}, author = {{IEEE}}, month = sep, year = {1990} }, @inproceedings{newcomb_abstract_2005, address = {Washington, {DC}, {USA}}, title = {Abstract Syntax Tree Metamodel Standard, {ASTM} Tutorial 1.0}, author = {Newcomb, Philip}, month = oct, year = {2005}, keywords = {abstract syntax trees}, annote = {Discusses a series of proposed {OMG} modeling standards and how they inter-relate - Knowledge Discovery {(RFP1)}, Abstract Syntax Trees {(RFP} 2), Analysis {(RFP} 3), Metrics, {(RFP} 4), Visualization {(RFP} 5), Refactoring {(RFP} 6), Target Mapping \&, Transformation, {(RFP} 7) Relevance: 5 } }, @inproceedings{van_deursen_identifying_1999, title = {Identifying Objects using Cluster and Concept Analysis}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.36.2469}, abstract = {Many approaches to support (semi-automatic) identification of objects in legacy code take the data structures as starting point for candidate classes. Unfortunately, legacy data structures tend to grow over time, and may contain many unrelated fields at the time of migration. We propose a method for identifying objects by semi-automatically restructuring the legacy data structures. Issues involved include the selection of record fields of interest, the identification of procedures actually dealing with such fields, and the construction of coherent groups of fields and procedures into candidate classes. We explore the use of cluster and concept analysis for the purpose of object identification, and we illustrate their effect on a 100,000 {LOC} Cobol system. 
Furthermore, we use these results to contrast clustering with concept analysis techniques.}, booktitle = {21st International Conference on Software Engineering}, author = {van Deursen, Arie and Kuipers, Tobias}, year = {1999}, keywords = {clustering, {FCA}, software clustering}, pages = {246--255} }, @article{yu_ontology_2008, title = {Ontology {Model-Based} Static Analysis on Java Programs}, issn = {0730-3157}, doi = {10.1109/COMPSAC.2008.73}, abstract = {Typical enterprise and military software systems consist of millions of lines of code with complicated dependence on diverse library abstractions. Manually debugging these codes imposes developers overwhelming workload and difficulties. To address software quality concerns efficiently, this paper proposes an ontology-based static analysis approach to automatically detect bugs in the source code of Java programs. First, we elaborate bug list collected, classify bugs into different categories, and translate bug patterns into {SWRL} {(Semantic} Web Rule Language) rules using an ontology tool, Prot\'{e}g\'{e}. An ontology model of Java program is created according to Java program specification using Prot\'{e}g\'{e} as well. Both {SWRL} rules and the program ontology model are exported in {OWL} {(Web} Ontology Language) format. Second, Java source code under analysis is parsed into the Abstract Syntax Tree {(AST)}, which is automatically mapped to the individuals of the program ontology model. {SWRL} Bridge takes in the exported {OWL} file (representing the {SWRL} rules model and program ontology model) and the individuals created for the Java code, conduits to Jess (a rule engine), and obtains inference results indicating any bugs. We perform experiments to compare bug detection capability with well-known {FindBugs} tool. 
A prototype of bug detector tool is developed to show the validity of the proposed static analysis approach.}, journal = {Computer Software and Applications Conference, Annual International}, author = {Yu, Lian and Zhou, Jun and Yi, Yue and Li, Ping and Wang, Qianxiang}, year = {2008}, keywords = {abstract syntax trees, ontologies, semantic web}, pages = {92--99}, annote = {Discusses a program that translates bug patterns into {SWRL} {(Semantic} Web Rule Language) rules using an ontology tool, Prot\'{e}g\'{e}. An ontology model of Java program is created according to Java program specification using Prot\'{e}g\'{e} as well. Both {SWRL} rules and the program ontology model are exported in {OWL} {(Web} Ontology Language) format. Second, Java source code under analysis is parsed into the Abstract Syntax Tree {(AST)}, which is automatically mapped to the individuals of the program ontology model. {SWRL} Bridge takes in the exported {OWL} file (representing the {SWRL} rules model and program ontology model) and the individuals created for the Java code, conduits to Jess (a rule engine), and obtains inference results indicating any bugs. Relevance: 5 } }, @inproceedings{henkel_catchup!_2005, title = {{CatchUp!}}, isbn = {1595939632}, url = {http://dl.acm.org/citation.cfm?id=1062512}, doi = {10.1145/1062455.1062512}, abstract = {Library developers who have to evolve a library to accommodate changing requirements often face a dilemma: Either they implement a clean, efficient solution but risk breaking client code, or they maintain compatibility with client code, but pay with increased design complexity and thus higher maintenance costs over time. We address this dilemma by presenting a lightweight approach for evolving application programming interfaces {(APIs)}, which does not depend on version control or configuration management systems. 
Instead, we capture {API} refactoring actions as a developer evolves an {API.} Users of the {API} can then replay the refactorings to bring their client software components up to date. We present catchup!, an implementation of our approach that captures and replays refactoring actions within an integrated development environment semi-automatically. Our experiments suggest that our approach could be valuable in practice.}, booktitle = {{ICSE} '05 Proceedings of the 27th international conference on Software engineering}, publisher = {{ACM} Press}, author = {Henkel, Johannes and Diwan, Amer}, year = {2005}, pages = {274--283} }, @inproceedings{bonja_metrics_2006, address = {Melbourne, Florida}, title = {Metrics for class cohesion and similarity between methods}, isbn = {1-59593-315-8}, url = {http://portal.acm.org/citation.cfm?id=1185469}, doi = {10.1145/1185448.1185469}, abstract = {Class cohesion is one of the desirable properties in object oriented designs. But, designers and managers need a good metric for this property to help them evaluate, compare and choose among various possible solutions to a given problem. In this paper, we will present a new metric for class cohesion based on similarity between the methods of a class. Cohesion metrics developed in previous researches have made vital contributions in the area; however, they are subject to some criticisms. This research is based on the same theoretical foundations and addresses some of the drawbacks in previous works. The theory behind similarity of methods is discussed, and then the notion is used to develop a metric for the degree of similarity between a pair of methods in a class. This metric will be extended further to develop a new metric for class cohesion. 
Next, the developed cohesion metric is evaluated with Weyuker's set of properties for measurement followed by use of the metrics with some empirical data from a small application developed for student class project along with data collection tool for computing the metrics. Our empirical validation deals with specific examples of classes to show that the new metric is more powerful than {LCOM}, one of the basic metrics for lack of cohesion in class, and {CAMC}, another metric for class cohesion. Moreover, it shows how the new metric reflects the intuition of class cohesion and discriminates against classes for which these metrics consider equally cohesive or non-cohesive.}, booktitle = {Proceedings of the 44th Annual Southeast Regional Conference}, publisher = {{ACM}}, author = {Bonja, Challa and Kidanmariam, Eyob}, year = {2006}, keywords = {cohesion, complexity, metrics}, pages = {91--95} }, @phdthesis{anslow_evaluating_2008, type = {Master's thesis}, title = {Evaluating Extensible {3D} {(X3D)} Graphics For Use in Software Visualisation}, school = {Victoria University of Wellington}, author = {Anslow, Craig}, year = {2008}, keywords = {visualization} }, @article{fenton_software_1994, title = {Software Measurement: A Necessary Scientific Basis}, volume = {20}, issn = {0098-5589}, shorttitle = {Software Measurement}, doi = {10.1109/32.268921}, abstract = {Software measurement, like measurement in any other discipline, must adhere to the science of measurement if it is to gain widespread acceptance and validity. The observation of some very simple, but fundamental, principles of measurement can have an extremely beneficial effect on the subject. Measurement theory is used to highlight both weaknesses and strengths of software metrics work, including work on metrics validation. 
We identify a problem with the well-known Weyuker properties {(E.J.} Weyuker, 1988), but also show that a criticism of these properties by {J.C.} Cherniavsky and {C.H.} Smith (1991) is invalid. We show that the search for general software complexity measures is doomed to failure. However, the theory does help us to define and validate measures of specific complexity attributes. Above all, we are able to view software measurement in a very wide perspective, rationalising and relating its many diverse activities.}, number = {3}, journal = {{IEEE} Transactions on Software Engineering}, author = {Fenton, N.}, year = {1994}, keywords = {complexity, measurement theory, metrics, metrics validation}, pages = {199--206} }, @article{prim_shortest_1957, title = {Shortest connection networks and some generalizations}, volume = {36}, number = {6}, journal = {Bell System Technical Journal}, author = {Prim, R. C.}, year = {1957}, keywords = {graphs}, pages = {1389--1401} }, @inproceedings{shtern_evaluating_2011, title = {Evaluating software clustering using multiple simulated authoritative decompositions}, isbn = {978-1-4577-0663-9}, doi = {10.1109/ICSM.2011.6080802}, abstract = {Evaluation of software clustering algorithms is typically done by comparing the clustering results to an authoritative decomposition prepared manually by a system expert. A well-known drawback of this approach is the fact that there are many, equally valid ways to decompose a software system, since different clustering objectives create different decompositions. Evaluating all clustering algorithms against a single authoritative decomposition can lead to biased results. In this paper, we introduce {LimSim}, a novel approach for software clustering evaluation that utilizes multiple simulated authoritative decompositions. We also present experimental results of applying the new approach to evaluate various software clustering algorithms. 
The results demonstrate the usefulness of {LimSim.}}, booktitle = {2011 27th {IEEE} International Conference on Software Maintenance {(ICSM)}}, publisher = {{IEEE}}, author = {Shtern, Mark and Tzerpos, Vassilios}, month = sep, year = {2011}, keywords = {Linux}, pages = {353--361} }, @book{fowler_refactoring_1999, address = {Boston}, title = {Refactoring : Improving the Design of Existing Code}, isbn = {0201485672, 9780201485677}, lccn = {{QA76.76} R42 F787 R}, publisher = {{Addison-Wesley}}, author = {Fowler, Martin and Beck, Kent and Brant, John and Opdyke, William and Roberts, Don}, year = {1999}, keywords = {refactoring, smells}, annote = {http://victoria.lconz.ac.nz.voyager-myvicsso. {vuw.ac.nz/cgi-bin/Pwebrecon.cgi?BBID=999504}} }, @article{alshayeb_empirical_2009, title = {Empirical investigation of refactoring effect on software quality}, issn = {09505849}, url = {http://dx.doi.org/10.1016/j.infsof.2009.04.002}, doi = {10.1016/j.infsof.2009.04.002}, abstract = {Developers and designers always strive for quality software. Quality software tends to be robust, reliable and easy to maintain, and thus reduces the cost of software development and maintenance. Several methods have been applied to improve software quality. Refactoring is one of those methods. The goal of this paper is to validate/invalidate the claims that refactoring improves software quality. We focused this study on different external quality attributes, which are adaptability, maintainability, understandability, reusability, and testability. 
We found that refactoring does not necessarily improve these quality attributes.}, journal = {Information and Software Technology}, author = {Alshayeb, Mohammad}, month = apr, year = {2009}, keywords = {empirical, maintainability, refactoring} }, @inproceedings{mansurov_knowledge_2005, address = {Alexandria, {VA}, {USA}}, title = {Knowledge Discovery Meta-model: Tutorial}, author = {Mansurov, Nikolai}, month = oct, year = {2005}, keywords = {kbs, ontologies}, annote = {Gives some detail on the {KDM} via class diagrams. The Code and Action packages are particularly relevant. It's unclear whether there is enough detail in the model to enable fine-grained representation of a program sufficient for refactoring. Relevance: 4 } }, @phdthesis{maentylae_bad_2003, type = {Master's thesis}, title = {Bad smells in software{\textendash}a taxonomy and an empirical study}, school = {Helsinki University of Technology}, author = {M{\"a}ntyl{\"a}, M.}, month = may, year = {2003}, keywords = {empirical, metrics, metrics validation, smells} }, @phdthesis{opdyke_refactoring:_1992, type = {{PhD} Thesis}, title = {Refactoring: a program restructuring aid in designing object-oriented application frameworks}, school = {University of Illinois at {Urbana-Champaign}}, author = {Opdyke, William}, month = may, year = {1992}, keywords = {refactoring, thesis}, annote = {The origin of "refactoring". } }, @inproceedings{mikhajlov_study_1998, title = {A Study of The Fragile Base Class Problem}, abstract = {In this paper we study the fragile base class problem. This problem occurs in open object-oriented systems employing code inheritance as an implementation reuse mechanism. System developers unaware of extensions to the system developed by its users may produce a seemingly acceptable revision of a base class which may damage its extensions. The fragile base class problem becomes apparent during maintenance of open object-oriented systems, but requires consideration during design. 
We express the fragile base class problem in terms of a flexibility property. By means of five orthogonal examples, violating the flexibility property, we demonstrate different aspects of the problem. We formulate requirements for disciplining inheritance, and extend the refinement calculus to accommodate for classes, objects, class-based inheritance, and class refinement. We formulate and formally prove a flexibility theorem demonstrating that the restrictions we impose on inheritance are sufficient to permit safe substitution of a base class with its revision in presence of extension classes.}, booktitle = {Proceedings of the 12th European Conference on {Object-Oriented} Programming}, publisher = {{Springer-Verlag} London, {UK}}, author = {Mikhajlov, Leonid and Sekerinski, Emil}, year = {1998}, keywords = {coupling, inheritance, {OOD}, software reuse}, pages = {355--382} }, @inproceedings{whittle_constraint-based_2011, series = {Lecture Notes in Computer Science}, title = {Constraint-based model refactoring}, volume = {6981}, isbn = {978-3-642-24484-1}, location = {Heidelberg}, abstract = {The {UML} standard specifies well-formedness rules as constraints on {UML} models. To be correct, refactoring of a model must take these constraints into account and check that they are still satisfied after a refactoring has been performed {\textemdash} if not, the refactoring must be refused. With constraint-based refactoring, constraint checking is replaced by constraint solving, lifting the role of constraints from permitting or denying a tentative refactoring to computing additional model changes required for the refactoring to be executable. Thus, to the degree that the semantics of a modelling language is specified using constraints, refactorings based on these constraints are guaranteed to be meaning preserving. 
To enable the reuse of pre-existing constraints for refactoring, we present a mapping from well-formedness rules as provided by the {UML} standard to constraint rules as required by constraint-based refactoring. Using these mappings, models can be refactored at no extra cost; if refactorings fail, the lack of meaning preservation points us to how the constraint-based semantic specifications of the modelling language can be improved.}, booktitle = {Model Driven Engineering Languages and Systems}, publisher = {Springer}, author = {Steimann, Friedrich}, editor = {Whittle, Jon and Clark, Tony and K{\"u}hne, Thomas}, year = {2011}, keywords = {constraint satisfaction, refactoring, {UML}}, pages = {440--454} }, @article{etzkorn_comparison_2004, title = {A comparison of cohesion metrics for object-oriented systems}, volume = {46}, issn = {0950-5849}, url = {http://www.sciencedirect.com/science/article/B6V0B-4BJ261T-2/2/809f6b83ba06f37cda6ea73c5be4912b}, doi = {10.1016/j.infsof.2003.12.002}, abstract = {Cohesion is the degree to which the elements of a class or object belong together. Many different object-oriented cohesion metrics have been developed; many of them are based on the notion of degree of similarity of methods. No consensus has yet arisen as to which of these metrics best measures cohesion; this is a problem for software developers since there are so many suggested metrics, it is difficult to make an informed choice. This research compares various cohesion metrics with ratings of two separate teams of experts over two software packages, to determine which of these metrics best match human-oriented views of cohesion. Additionally, the metrics are compared statistically, to determine which tend to measure the same kinds of cohesion. Differences in results for different object-oriented metrics tools are discussed.}, number = {10}, journal = {Information and Software Technology}, author = {Etzkorn, Letha H. and Gholston, Sampson E. and Fortune, Julie L. 
and Stein, Cara E. and Utley, Dawn and Farrington, Phillip A. and Cox, Glenn W.}, month = aug, year = {2004}, keywords = {cohesion, comparative study, empirical, metrics, user studies}, pages = {677--687}, annote = {Discusses different cohesion measures {(LCOM} {LCOM1} {LCOM2} {LCOM3} {LCOM4} {LCOM5} Coh {LCC} {TCC} {PLCOM1} {PLCOM3)} and points out sources of confusion in the names and what they measure. They also found that different tools gave different results for what was supposedly the same metric. Of the metrics studied, {LCC} was the winner in terms of how the results matched those expected by the human test group. While it discusses {CBMC}, it didn't include {CBMC} in the results. They couldn't get it to work. It doesn't address {ICBMC.} Relevance: 4} }, @inproceedings{kim_empirical_2005, address = {New York, {NY}, {USA}}, title = {An empirical study of code clone genealogies}, isbn = {1-59593-014-0}, location = {Lisbon, Portugal}, doi = {10.1145/1081706.1081737}, booktitle = {{ESEC/FSE-13:} Proceedings of the 10th European software engineering conference held jointly with 13th {ACM} {SIGSOFT} international symposium on Foundations of software engineering}, publisher = {{ACM}}, author = {Kim, Miryung and Sazawal, Vibha and Notkin, David and Murphy, Gail}, year = {2005}, keywords = {clones, empirical}, pages = {187--196}, annote = {Investigates the evolution of clones in a software system. Concludes that many efforts to refactor are either unnecessary or undoable, due to inadequacies of the programming language. 
Relevance: 3 } }, @techreport{omadadhain_jung_2003, type = {Technical Report}, title = {The {JUNG} {(Java} Universal {Network/Graph)} Framework}, number = {{UCI-ICS} 03-17}, institution = {School of Information and Computer Science, University of California, Irvine}, author = {{O'Madadhain}, Joshua and Fisher, Danyel and White, Scott and Boey, {Yan-Biao}}, year = {2003}, keywords = {graph algorithms, graph layout, graphs} }, @inproceedings{fung_hierarchical_2003, title = {Hierarchical Document Clustering Using Frequent Itemsets}, volume = {3}, abstract = {A major challenge in document clustering is the extremely high dimensionality. For example, the vocabulary for a document set can easily be thousands of words. On the other hand, each document often contains a small fraction of words in the vocabulary. These features require special handlings. Another requirement is hierarchical clustering where clustered documents can be browsed according to the increasing specificity of topics. In this paper, we propose to use the notion of frequent itemsets, which comes from association rule mining, for document clustering. The intuition of our clustering criterion is that each cluster is identified by some common words, called frequent itemsets, for the documents in the cluster. Frequent itemsets are also used to produce a hierarchical topic tree for clusters. By focusing on frequent items, the dimensionality of the document set is drastically reduced. We show that this method outperforms best existing methods in terms of both clustering accuracy and scalability}, booktitle = {Proceedings of the Third {SIAM} International Conference on Data Mining}, author = {Fung, B. C. M. and Wang, K. 
and Ester, M.}, year = {2003}, pages = {59} }, @article{boehm_software_1984, title = {Software Engineering Economics}, volume = {{SE-10}}, issn = {0098-5589}, doi = {10.1109/TSE.1984.5010193}, abstract = {This paper summarizes the current state of the art and recent trends in software engineering economics. It provides an overview of economic analysis techniques and their applicability to software engineering and management. It surveys the field of software cost estimation, including the major estimation techniques available, the state of the art in algorithmic cost models, and the outstanding research issues in software cost estimation.}, number = {1}, journal = {{IEEE} Transactions on Software Engineering}, author = {Boehm, Barry W.}, month = jan, year = {1984}, keywords = {cost models, Costs, software cost estimation, software management}, pages = {4--21} }, @inproceedings{meyers_slice-based_2004, title = {{Slice-Based} Cohesion Metrics and Software Intervention}, isbn = {0-7695-2243-2}, url = {http://portal.acm.org/citation.cfm?id=1039057}, abstract = {Software reconstruction is a costly endeavor, due in part to the ambiguity of where to focus reengineering effort. Cohesion metrics, and particularly quantitative cohesion metrics, have the potential to aid in this identification and to measure progress. The most extensive work on such metrics is with slice-based cohesion metrics. While their use of semantic dependence information should make them an excellent choice for cohesion measurement, their wide spread use has been impeded by a lack of empirical study. Recent advances in software tools make, for the first time, a large-scale empirical study of slice-based cohesion metrics possible. Three results from such a study are presented. First, base-line values for slice-based metrics are provided. These values act as targets for reengineering efforts with modules having values outside the expected range being the most in need of attention. 
Second, two longitudinal studies show that slice-based metrics quantify the deterioration of a program as it ages. This serves to validate the metrics: the metrics quantify the degradation that exists during development; turning this around, the metrics can be used to measure the progress of a reengineering effort. Finally, "head-to-head" qualitative and quantitative comparisons of the metrics identify which metrics provide similar views of a program and which provide unique views of a program.}, booktitle = {Proceedings of the 11th Working Conference on Reverse Engineering}, publisher = {{IEEE} Computer Society}, author = {Meyers, Timothy M. and Binkley, David}, year = {2004}, keywords = {aspects, cohesion, metrics}, pages = {256--265} }, @book{pach_graph_2005, address = {Berlin / Heidelberg}, series = {Lecture Notes in Computer Science}, title = {Graph Drawing}, volume = {3383}, isbn = {978-3-540-24528-5}, url = {http://dx.doi.org/10.1007/b105810}, publisher = {Springer}, author = {Pach, J{\'a}nos}, year = {2005}, keywords = {graph layout, graphs} }, @inproceedings{murphy-hill_seven_2008, address = {Atlanta, Georgia}, title = {Seven habits of a highly effective smell detector}, isbn = {978-1-60558-228-3}, url = {http://portal.acm.org/citation.cfm?id=1454261}, doi = {10.1145/1454247.1454261}, abstract = {The process of refactoring code---changing its structure while preserving its meaning---has been identified as an important way of maintaining code quality over time. However, it is sometimes difficult for progammers to identify which pieces of code are in need of refactoring. {"Smell} detectors" are designed to help programmers in this task, but most smell detectors do not mesh well with "floss refactoring," the recommended tactic in which refactoring and programming are finely interleaved. 
In this paper we present a smell detector that we have built with floss refactoring in mind by combining seven habits that we postulate are important to consider when designing usable smell detectors. We hope that this combination can help the designers of future smell detectors build tools that align with the way that programmers refactor.}, booktitle = {Proceedings of the 2008 International Workshop on Recommendation Systems for Software Engineering}, publisher = {{ACM}}, author = {{Murphy-Hill}, Emerson and Black, Andrew P.}, year = {2008}, keywords = {refactoring, smells, tools}, pages = {36--40} }, @article{omadadhain_analysis_2005, title = {Analysis and visualization of network data using {JUNG}}, volume = {10}, journal = {Journal of Statistical Software}, author = {{O'Madadhain}, J. and Fisher, D. and Smyth, P. and White, S. and Boey, Y. B.}, year = {2005}, keywords = {graph algorithms, graph layout, graphs}, pages = {1--35} }, @inproceedings{kessentini_deviance_2010, address = {Antwerp, Belgium}, title = {Deviance from perfection is a better criterion than closeness to evil when identifying risky code}, url = {http://portal.acm.org/citation.cfm?id=1858996.1859015}, doi = {10.1145/1858996.1859015}, abstract = {We propose an approach for the automatic detection of potential design defects in code. The detection is based on the notion that the more code deviates from good practices, the more likely it is bad. Taking inspiration from artificial immune systems, we generated a set of detectors that characterize different ways that a code can diverge from good practices. We then used these detectors to measure how far code in assessed systems deviates from normality. We evaluated our approach by finding potential defects in two open-source systems {(Xerces-J} and Gantt). We used the library {JHotDraw} as the code base representing good design/programming practices. 
In both systems, we found that 90\% of the riskiest classes were defects, a precision far superiour to state of the art rule-based approaches.}, booktitle = {Proceedings of the {IEEE/ACM} International Conference on Automated Software Engineering - {ASE} '10}, author = {Kessentini, Marouane and Vaucher, St{\'e}phane and Sahraoui, Houari}, year = {2010}, keywords = {maintenance}, pages = {113} }, @inproceedings{savage_topicxp:_2010, address = {Timi\c{s}oara, Romania}, title = {{TopicXP:} exploring topics in source code using Latent Dirichlet Allocation}, shorttitle = {{TopicXP}}, abstract = {Acquiring general understanding of large software systems and components from which they are built can be a time consuming task, but having such an understanding is an important prerequisite to adding features or fixing bugs. In this paper we propose the tool, namely {TopicXP}, to support developers during such software maintenance tasks by extracting and analyzing unstructured information in source code identifier names and comments using Latent Dirichlet Allocation. {TopicXP} enables developers to gain an overview of a software system under analysis by extracting and visualizing natural language topics, which generally correspond to concepts or features implemented in software classes. {TopicXP} is implemented as an open-source Eclipse plug-in, which proposes interactive visualization of topics along with structural dependencies between underlying classes implementing these topics. The paper also presents the results of a preliminary user study aimed at evaluating {TopicXP.}}, booktitle = {Proceedings of the 26th {IEEE} International Conference on Software Maintenance}, author = {Savage, T. and Dit, B. and Gethers, M. and Poshyvanyk, D.}, month = sep, year = {2010}, keywords = {cohesion, Eclipse, semantics, visualization}, annote = {Describes an Eclipse plug-in that uses semantic analysis of identifiers and comments to recognize topics in source code. 
It also calculates a conceptual cohesion metric - {MWE.} See {http://www.cs.wm.edu/semeru/TopicXP/}} }, @article{tsantalis_identification_2011, title = {Identification of Extract Method Refactoring}, volume = {7}, number = {1}, journal = {{ACM} Transactions on Programming Languages and Systems}, author = {Tsantalis, N. and Chatzigeorgiou, A.}, year = {2011}, pages = {37--61} }, @article{myers_software_2003, title = {Software systems as complex networks: Structure, function, and evolvability of software collaboration graphs}, volume = {68}, doi = {10.1103/PhysRevE.68.046116}, abstract = {Software systems emerge from mere keystrokes to form intricate functional networks connecting many collaborating modules, objects, classes, methods, and subroutines. Building on recent advances in the study of complex networks, I have examined software collaboration graphs contained within several open-source software systems, and have found them to reveal scale-free, small-world networks similar to those identified in other technological, sociological, and biological systems. I present several measures of these network topologies, and discuss their relationship to software engineering practices. I also present a simple model of software system evolution based on refactoring processes which captures some of the salient features of the observed systems. Some implications of object-oriented design for questions about network robustness, evolvability, degeneracy, and organization are discussed in the wake of these findings.}, journal = {Physical Review E}, author = {Myers, Christopher}, month = oct, year = {2003}, keywords = {design patterns, refactoring, {SNA}}, annote = {This is a broad-ranging paper that considers software systems relative to other emergent systems, including biological and social systems. It discusses refactoring, evolution, intelligent design, and design patterns. 
Relevance: 5} }, @inproceedings{schaefer_challenge_2008, address = {Savannah, {GA}, {USA}}, title = {Challenge proposal: verification of refactorings}, isbn = {978-1-60558-330-3}, shorttitle = {Challenge proposal}, url = {http://portal.acm.org/citation.cfm?id=1481859}, doi = {10.1145/1481848.1481859}, abstract = {Automated refactoring tools are an essential part of a software developer's toolbox. They are most useful for gradually improving large existing code bases and it is essential that they work reliably, since even a simple refactoring may affect many different parts of a program, and the programmer should not have to inspect every individual change to ensure that the transformation went as expected. Even extensively tested industrial-strength refactoring engines, however, are fraught with many bugs that lead to incorrect, non-behaviour preserving transformations. We argue that software refactoring tools are a prime candidate for mechanical verification, offering significant challenges but also the prospect of tangible benefits for real-world software development.}, booktitle = {Proceedings of the 3rd workshop on Programming languages meets program verification}, publisher = {{ACM}}, author = {Sch{\"a}fer, Max and Ekman, Torbj{\"o}rn and de Moor, Oege}, year = {2008}, keywords = {refactoring}, pages = {67--72} }, @inproceedings{du_bois_refactoring_2004, title = {Refactoring -- improving coupling and cohesion of existing code}, issn = {1095-1350}, doi = {10.1109/WCRE.2004.33}, abstract = {Refactorings are widely recognised as ways to improve the internal structure of object-oriented software while maintaining its external behaviour. 
Unfortunately, refactorings concentrate on the treatment of symptoms (the so called code-smells), thus improvements depend a lot on the skills of the maintainer. Coupling and cohesion on the other hand are quality attributes which are generally recognized as being among the most likely quantifiable indicators for software maintainability. Therefore, this paper analyzes how refactorings manipulate coupling/cohesion characteristics, and how to identify refactoring opportunities that improve these characteristics. As such we provide practical guidelines for the optimal usage of refactoring in a software maintenance process.}, booktitle = {Proceedings 11th Working Conference on Reverse Engineering}, author = {Du Bois, B. and Demeyer, S. and Verelst, J.}, year = {2004}, keywords = {cohesion, coupling, maintenance, metrics, refactoring, smells}, pages = {144--151}, annote = {Has a (+0-) table that shows the qualitative effects of various refactorings on coupling and cohesion measurements. Surprise, many refactorings could make things better or worse. Relevance: 3} }, @article{perez_case_2010, title = {A case study to evaluate the suitability of graph transformation tools for program refactoring}, volume = {12}, url = {http://dx.doi.org/10.1007/s10009-010-0153-y}, doi = {10.1007/s10009-010-0153-y}, abstract = {This article proposes a case study to evaluate the suitability of graph transformation tools for program refactoring. To qualify for this purpose, a graph transformation system must be able to (1) import a graph-based representation of models of Java programs, (2) allow these models to be transformed interactively with well-known program refactorings and (3) export the resulting models in the same graph-based format used as input. The case study aims to enable comparison of various features of graph transformation tools, such as their expressiveness and their ability to interact with the user. 
The model of Java programs is presented and some examples for translating Java source code into the model are provided. The refactorings selected for the case study are specified in detail.}, number = {3}, journal = {International Journal on Software Tools for Technology Transfer {(STTT)}}, author = {P{\'e}rez, Javier and Crespo, Yania and Hoffmann, Berthold and Mens, Tom}, month = jul, year = {2010}, keywords = {graph algorithms, graphs, refactoring, transformation}, pages = {183--199} }, @inproceedings{coskun_applying_2011, address = {Ljubljana, Slovenia}, title = {Applying Community Detection Algorithms on Ontologies for Indentifying Concept Groups}, author = {Coskun, G{\"o}khan and Rothe, M. and Teymourian, K. and Paschke, A.}, month = aug, year = {2011} }, @article{li_object-oriented_1993, title = {Object-oriented metrics that predict maintainability}, volume = {23}, url = {http://portal.acm.org/citation.cfm?id=170622}, abstract = {Software metrics have been studied in the procedural paradigm as a quantitative means of assessing the software development process as well as the quality of software products. Several studies have validated that various metrics are useful indicators of maintenance effort in the procedural paradigm. However, software metrics have rarely been studied in the object oriented paradigm. Very few metrics have been proposed to measure object oriented systems, and the proposed ones have not been validated. This research concentrates on several object oriented software metrics and the validation of these metrics with maintenance effort in two commercial systems. Statistical analyses of a prediction model incorporating ten metrics are performed. In addition, a more compact model with fewer metrics was sought, analyses performed, and also presented.}, number = {2}, journal = {Journal of Systems and Software}, 
author = {Li, Wei and Henry, Sallie}, year = {1993}, keywords = {cohesion, key generation algorithm, maintainability, metrics}, pages = {111--122} }, @inproceedings{mueller_composing_1990, address = {San Diego, {CA}, {USA}}, title = {Composing Subsystem Structures using (k,2)-partite Graphs}, url = {http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.71}, abstract = {Subsystem composition is the process of constructing composite software components out of building blocks such as variables, procedures, modules, and subsystems. Hierarchical subsystem structures are formed by imposing equivalence relations on the resource-flow graphs of the source code. Composition algorithms often use a single equivalence relation (e.g., connection strength or data binding measure) to automatically form tree-shaped composite structures. This paper describes a clustering algorithm that uses four equivalence relations for identifying subsystem structures. The resulting compositions are (k,2)-partite graphs (a class of layered graphs) rather than strict tree hierarchies. The algorithm is an integral part of our interactive graph editor.}, booktitle = {Proceedings of the Conference on Software Maintenance, 1990}, author = {M{\"u}ller, Hausi A and Uhl, James S}, year = {1990}, keywords = {clustering, graphs, software clustering, software remodularization}, pages = {12--19}, annote = {Includes ideas of filtering/restructuring graphs, e.g. 
the removal of omnipresent nodes.} }, @inproceedings{ossher_sourcererdb:_2009, address = {Los Alamitos, {CA}, {USA}}, title = {{SourcererDB:} An aggregated repository of statically analyzed and cross-linked open source Java projects}, isbn = {978-1-4244-3493-0}, shorttitle = {{SourcererDB}}, doi = {10.1109/MSR.2009.5069501}, abstract = {The open source movement has made vast quantities of source code available online for free, providing an extremely large dataset for empirical study and potential resuse. A major difficulty in exploiting this potential fully is that the data are currently scattered between competing source code repositories, none of which are structured for empirical analysis and cross-project comparison. As a result, software researchers and developers are left to compile their own datasets, resulting in duplicated effort and limited results. To address this challenge, we built {SourcererDB}, an aggregated repository of statically analyzed and cross-linked open source Java projects. {SourcererDB} contains local snapshots of 2,852 Java projects taken from Sourceforge, Apache and Java.net. These projects are statically analyzed to extract rich structural information, which is then stored in a relational database. References to entities in the 16,058 external jars are resolved and grouped, allowing for cross-project usage information to be accessed easily. This paper describes: (a) the mechanism for resolving and grouping these cross-project references, (b) the structure of and the metamodel for the {SourcererDB} repository, and (d) end-user dataset access mechanisms. 
Our goal in building {SourcererDB} is to provide a rich dataset of source code to facilitate the sharing of extracted data and to encourage reuse and repeatability of experiments.}, booktitle = {Mining Software Repositories, International Workshop on}, publisher = {{IEEE} Computer Society}, author = {Ossher, Joel and Bajracharya, Sushil and Linstead, Erik and Baldi, Pierre and Lopes, Cristina}, year = {2009}, keywords = {empirical, metrics}, pages = {183--186} }, @article{smart_pmd_2006, title = {{PMD} Squashes Code Bugs}, url = {http://www.devx.com/java/Article/31286/0/page/1}, journal = {{DevX}}, author = {Smart, John}, month = apr, year = {2006}, keywords = {tools} }, @mastersthesis{dudziak_tool-supported_2002, type = {Master's thesis}, title = {Tool-supported discovery and refactoring of structural weaknesses in code}, abstract = {Software systems are changed throughout their entire life, e.g. to adapt it to changed environments or to incorporate new features. Refactoring deals with restructuring a system in such a way that these changes can be implemented without problems. Manual refactoring however is costly and error prone so that tools are preferred which handle certain tasks in a fast and reliable way. We present an approach for creating a tool that supports the developer in three important areas: 1. The tool is able to detect structural weaknesses automatically, and to report them to the developer. A tool can perform this task much faster and more thorough, even though only a limited range of problems in the structure are detectable by tools. 2. In addition, it can determine for certain problems which kinds of restructuring would provide the most benefit. This type of analysis is especially difficult for developers. 3. For certain kinds of restructuring it can assess whether they are applicable, and it then can perform them in an automated fashion. Especially restructuring is particularly suited for automatization. 
We use simple program representations (abstract syntax tree) and commonplace static analysis in order to implement a prototypical add-in for the {NetBeans} {IDE.}}, school = {Technical University of Berlin}, author = {Dudziak, Thomas and Wloka, Jan}, month = feb, year = {2002}, keywords = {refactoring} }, @article{frenzel_language_2006, title = {The Language Toolkit: An {API} for Automated Refactorings in Eclipse-based {IDEs}}, volume = {5}, url = {http://www.eclipse.org/articles/Article-LTK/ltk.html}, journal = {Eclipse Magazin}, author = {Frenzel, Leif}, month = jan, year = {2006}, keywords = {Eclipse, refactoring}, annote = {Describes some of the specific Eclipse refactoring {APIs.} Relevance: 5 } }, @article{wang_dmc:_2005, title = {{DMC:} a more precise cohesion measure for classes}, volume = {47}, issn = {0950-5849}, shorttitle = {{DMC}}, url = {http://www.sciencedirect.com/science/article/B6V0B-4D9DFH0-1/2/34347054ee260d7e8874a052878ca06b}, doi = {10.1016/j.infsof.2004.07.001}, abstract = {In object-oriented systems, a single class consists of attributes and methods and its cohesion denotes the degree of relatedness among these elements. To quantify the cohesiveness of a class, a large number of measures that only depict method-attribute reference relationships have been proposed in last decade. However, the flow-dependence relationships among attributes, the direction of method-attribute references, and the potential dependence relationships among the elements in the class are ignored. To address this problem, this paper first depicts four types of explicit dependence relationships and uses a class element dependence graph to represent all dependencies among the elements in a class. Then, a dependence matrix that reflects the degree of direct dependence and indirect dependence among the elements in a class is computed. 
Finally, a more precise cohesion measure for classes is proposed.}, number = {3}, journal = {Information and Software Technology}, author = {Wang, Jianmin and Zhou, Yuming and Wen, Lijie and Chen, Yujian and Lu, Hongmin and Xu, Baowen}, month = mar, year = {2005}, keywords = {cohesion}, pages = {167--180} }, @article{steimann_infer_2007, title = {The Infer Type refactoring and its use for interface-based programming}, volume = {6}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.100.6825}, abstract = {Interface-based programming, i.e. the systematic use of interface types in variable declarations, serves the decoupling of classes and increases a program{\textquoteright}s changeability. To maximize this effect, interfaces should contain as few elements as possible. For the design of minimal (i.e., maximally general) interfaces, an in-depth analysis of the protocol needed from objects in a given context is required. However, currently available refactorings for the creation of such interfaces (such as Extract Interface) leave programmers alone with the decision what to include or, more importantly, what to omit: they let them choose manually from the protocol of a class, and only then offer the use of the new interface where (if) possible. 
To end this trial and error process, we have developed a new refactoring named Infer Type that, using type inference, completely automates the construction of new, context-specific interfaces and their use in variable declarations, thus supporting greater decoupling and access protection of code, serving the goals of interface-based programming.}, number = {2}, journal = {Journal of Object Technology}, author = {Steimann, Friedrich}, year = {2007}, keywords = {coupling, interfaces, refactoring}, pages = {67--89} }, @article{budanitsky_evaluating_2006, title = {Evaluating {WordNet-based} Measures of Lexical Semantic Relatedness}, volume = {32}, issn = {0891-2017}, url = {http://dx.doi.org/10.1162/coli.2006.32.1.13}, doi = {10.1162/coli.2006.32.1.13}, abstract = {The quantification of lexical semantic relatedness has many applications in {NLP}, and many different measures have been proposed. We evaluate five of these measures, all of which use {WordNet} as their central resource, by comparing their performance in detecting and correcting real-word spelling errors. An information-content-based measure proposed by Jiang and Conrath is found superior to those proposed by Hirst and {St-Onge}, Leacock and Chodorow, Lin, and Resnik. In addition, we explain why distributional similarity is not an adequate proxy for lexical semantic relatedness.}, number = {1}, journal = {Computational Linguistics}, author = {Budanitsky, Alexander and Hirst, Graeme}, year = {2006}, pages = {13--47} }, @inproceedings{khomh_bayesian_2009, address = {Los Alamitos, {CA}, {USA}}, title = {A Bayesian Approach for the Detection of Code and Design Smells}, doi = {10.1109/QSIC.2009.47}, abstract = {The presence of code and design smells can have a severe impact on the quality of a program. Consequently, their detection and correction have drawn the attention of both researchers and practitioners who have proposed various approaches to detect code and design smells in programs. However, none of these approaches handle the inherent uncertainty of the detection process. We propose a Bayesian approach to manage this uncertainty. First, we present a systematic process to convert existing state-of-the-art detection rules into a probabilistic model. We illustrate this process by generating a model to detect occurrences of the Blob antipattern. Second, we present results of the validation of the model: we built this model on two open-source programs, {GanttProject} v1.10.2 and Xerces v2.7.0, and measured its accuracy. Third, we compare our model with another approach to show that it returns the same candidate classes while ordering them to minimise the quality analysts' effort. 
Finally, we show that when past detection results are available, our model can be calibrated using machine learning techniques to offer an improved, context-specific detection.}, booktitle = {Quality Software, International Conference on}, publisher = {{IEEE} Computer Society}, author = {Khomh, Foutse and Vaucher, St{\'e}phane and Gu{\'e}h{\'e}neuc, {Yann-Ga\"{e}l} and Sahraoui, Houari}, year = {2009}, keywords = {bayesian belief networks, design smells, smells, software quality}, pages = {305--314} }, @article{dietrich_detection_2010, title = {On the Detection of {High-Impact} Refactoring Opportunities in Programs}, volume = {abs/1006.1747}, eprint = {1006.1747}, archiveprefix = {arXiv}, abstract = {We present a novel approach to detect refactoring opportunities by measuring the participation of references between types in instances of patterns representing design flaws. This technique is validated using an experiment where we analyse a set of 95 open-source Java programs for instances of four patterns representing modularisation problems. It turns out that our algorithm can detect high impact refactorings opportunities - a small number of references such that the removal of those references removes the majority of patterns from the program.}, journal = {{CoRR}}, author = {Dietrich, Jens and {McCartin}, Catherine and Tempero, Ewan D. and Shah, Syed M. Ali}, year = {2010} }, @inproceedings{abadi_re-approaching_2008, address = {Nashville, Tennessee, {USA}}, title = {Re-approaching the refactoring Rubicon}, abstract = {Fowler saw the availability of automated support for the Extract Method refactoring in modern {IDEs} as an indication for the crossing of the refactoring Rubicon. In spite of the advances in refactoring technology, it seems that this Rubicon has not yet been crossed, and refactoring support in modern {IDEs} leave a lot to be desired. We performed a case study in which we converted a Java servlet to use the model-view-controller pattern, using as much automated support as available. 
We found that while the whole conversion could be described as a series of refactorings, most of these were inadequately supported by the {IDE}, and some were not supported at all. Based on these findings, we outline the requirements from a refactoring framework that will support much more of the conversion process, and will also enable the composition of small refactorings into larger ones.}, booktitle = {Proceedings of the Second {ACM} Workshop on Refactoring Tools}, author = {Abadi, Ahari and Ettinger, Ran and Feldman, Yishai}, month = oct, year = {2008}, keywords = {refactoring}, annote = {Discusses problems with the extract-method refactoring, many having to do with insufficient analysis. For example, assignment to multiple variables could be circumvented by swapping statement positions if the swapped statements did not interact. Relevance: 5} }, @inproceedings{shtern_refining_2009, address = {Los Alamitos, {CA}, {USA}}, title = {Refining clustering evaluation using structure indicators}, isbn = {978-1-4244-4897-5}, doi = {10.1109/ICSM.2009.5306306}, abstract = {The evaluation of the effectiveness of software clustering algorithms is a challenging research question. Several approaches that compare clustering results to an authoritative decomposition have been presented in the literature. Existing evaluation methods typically compress the evaluation results into a single number. They also often disagree with each other for reasons that are not well understood. In this paper, we introduce a novel set of indicators that evaluate structural discrepancies between software decompositions. They also allow researchers to investigate the differences between existing evaluation approaches in a reduced search space. 
Several experiments with real software systems showcase the usefulness of the introduced indicators.}, booktitle = {{IEEE} International Conference on Software Maintenance}, publisher = {{IEEE} Computer Society}, author = {Shtern, Mark and Tzerpos, Vassilios}, year = {2009}, keywords = {cluster evaluation, clustering, software clustering}, pages = {297--305}, annote = {Contains a nice review of (software) cluster comparison techniques. It also has some nice examples that illustrate how two leading cluster comparison algorithms differ considerably. They introduce three "structure indicators" to help identify the different kinds of variations between two clusterings.} }, @inproceedings{muhammad_role_2010, title = {Role of relationships during clustering of object-oriented software systems}, doi = {10.1109/ICET.2010.5638477}, abstract = {Clustering has been applied by researchers for the architecture recovery of software systems. Clustering algorithms form clusters of similar entities, where similarity is determined by the characteristics of an entity or the relationships that exist between entities. Thus selecting appropriate relationships is important for improving cluster quality. As compared to structured systems, for which relationships have been evaluated, relatively little work has been done for object-oriented software systems to determine which relationships produce better clustering results. In this paper, we divide relationships within object-oriented systems into different categories and evaluate them. We conduct experiments on three test systems using well known hierarchical clustering algorithms. Our experimental results indicate the relationships that improve the quality of clustering results.}, booktitle = {Emerging Technologies {(ICET)}, 2010 6th International Conference on}, author = {Muhammad, S. and Maqbool, O. 
and Abbasi, A. Q.},
  year = {2010},
  keywords = {clustering, pattern clustering, software clustering},
  pages = {270--275}
},
@article{ko_exploratory_2006,
  title = {An Exploratory Study of How Developers Seek, Relate, and Collect Relevant Information during Software Maintenance Tasks},
  volume = {32},
  issn = {0098-5589},
  doi = {10.1109/TSE.2006.116},
  abstract = {Much of software developers' time is spent understanding unfamiliar code. To better understand how developers gain this understanding and how software development environments might be involved, a study was performed in which developers were given an unfamiliar program and asked to work on two debugging tasks and three enhancement tasks for 70 minutes. The study found that developers interleaved three activities. They began by searching for relevant code both manually and using search tools; however, they based their searches on limited and misrepresentative cues in the code, environment, and executing program, often leading to failed searches. When developers found relevant code, they followed its incoming and outgoing dependencies, often returning to it and navigating its other dependencies; while doing so, however, Eclipse's navigational tools caused significant overhead. Developers collected code and other information that they believed would be necessary to edit, duplicate, or otherwise refer to later by encoding it in the interactive state of Eclipse's package explorer, file tabs, and scroll bars. However, developers lost track of relevant code as these interfaces were used for other tasks, and developers were forced to find it again. These issues caused developers to spend, on average, 35 percent of their time performing the mechanics of navigation within and between source files.
These observations suggest a new model of program understanding grounded in theories of information foraging and suggest ideas for tools that help developers seek, relate, and collect information in a more effective and explicit manner.},
  number = {12},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Ko, Andrew J. and Myers, Brad A. and Coblenz, Michael J. and Aung, Htet Htet},
  year = {2006},
  keywords = {empirical, program comprehension, program understanding},
  pages = {971--987}
},
@inproceedings{subramaniam_reengineering_1998,
  title = {Reengineering the class---an object oriented maintenance activity},
  issn = {0730-3157},
  abstract = {When an Incremental Approach is used to develop an object-oriented system, there is a risk that the class design will deteriorate in quality with each increment. This paper presents a technique for detecting classes that may be prone to deteriorate, or if deterioration has occurred assists with reengineering those classes. Experience with applying this technique to an industrial software development project is also discussed},
  booktitle = {Proceedings of the {Twenty-Second} Annual International Computer Software and Applications Conference},
  author = {Subramaniam, G. V. and Byrne, E. J.},
  year = {1998},
  keywords = {maintenance, metrics, {OOP}, reengineering},
  pages = {39--44}
},
@misc{university_of_manchester_tones_2008,
  title = {{TONES} Ontology Repository},
  url = {http://owl.cs.manchester.ac.uk/repository/browser},
  author = {{University of Manchester}},
  year = {2008},
  note = {Accessed 2011-12-15},
  howpublished = {http://owl.cs.manchester.ac.uk/repository/browser}
},
@article{chikofsky_reverse_1990,
  title = {Reverse engineering and design recovery: a taxonomy},
  volume = {7},
  issn = {0740-7459},
  shorttitle = {Reverse engineering and design recovery},
  doi = {10.1109/52.43044},
  abstract = {The key to applying computer-aided software engineering to the maintenance and enhancement of existing systems lies in applying
reverse-engineering approaches. However, there is considerable confusion over the terminology used in both technical and marketplace discussions. The authors define and relate six terms: forward engineering, reverse engineering, redocumentation, design recovery, restructuring, and reengineering. The objective is not to create new terms but to rationalize the terms already in use. The resulting definitions apply to the underlying engineering processes, regardless of the degree of automation applied},
  number = {1},
  journal = {{IEEE} Software},
  author = {Chikofsky, E. J. and Cross, J. H.},
  year = {1990},
  keywords = {design recovery, reengineering, restructuring, taxonomy},
  pages = {13--17}
},
@inproceedings{fokaefs_decomposing_2009,
  address = {Los Alamitos, {CA}, {USA}},
  title = {Decomposing object-oriented class modules using an agglomerative clustering technique},
  isbn = {978-1-4244-4897-5},
  doi = {10.1109/ICSM.2009.5306332},
  abstract = {Software can be considered a live entity, as it undergoes many alterations throughout its lifecycle. Furthermore, developers do not usually retain a good design in favor of adding new features, comply with requirements or meet deadlines. For these reasons, code can become rather complex and difficult to understand. More particularly in object-oriented systems, classes may become very large and less cohesive. In order to identify such problematic cases, existing approaches have proposed the use of cohesion metrics. However, while metrics can identify classes with low cohesion, they cannot identify new or independent concepts. Moreover, these methods require a lot of human interpretation to identify the respective design flaws. In this paper, we propose a class decomposition method using an agglomerative clustering algorithm based on the Jaccard distance between class members.
Our methodology is able to identify new concepts and rank the solutions according to their impact on the design quality of the system. Finally, our method has been evaluated by two independent designers who were asked to comment on the suggestions produced by our technique on their projects. The designers provided feedback on the ability of the method to identify new concepts and improve the design quality of the system in terms of cohesion.},
  booktitle = {{IEEE} International Conference on Software Maintenance},
  publisher = {{IEEE} Computer Society},
  author = {Fokaefs, Marios and Tsantalis, Nikolaos and Chatzigeorgiou, Alexander and Sander, J{\"o}rg},
  year = {2009},
  keywords = {clustering, extract class, software clustering, software modules},
  pages = {93--101},
  annote = {Simple agglomerative clustering using a Jaccard distance. It's unclear exactly what is going into the property sets. After clustering, suggestions are ranked using the entity placement metric. Relevance: 5}
},
@article{han_frequent_2007,
  title = {Frequent pattern mining: current status and future directions},
  volume = {15},
  issn = {1384-5810},
  shorttitle = {Frequent pattern mining},
  url = {http://portal.acm.org/citation.cfm?id=1275092.1275097},
  doi = {10.1007/s10618-006-0059-1},
  abstract = {Frequent pattern mining has been a focused theme in data mining research for over a decade. Abundant literature has been dedicated to this research and tremendous progress has been made, ranging from efficient and scalable algorithms for frequent itemset mining in transaction databases to numerous research frontiers, such as sequential pattern mining, structured pattern mining, correlation mining, associative classification, and frequent pattern-based clustering, as well as their broad applications. In this article, we provide a brief overview of the current status of frequent pattern mining and discuss a few promising research directions.
We believe that frequent pattern mining research has substantially broadened the scope of data analysis and will have deep impact on data mining methodologies and applications in the long run. However, there are still some challenging research issues that need to be solved before frequent pattern mining can claim a cornerstone approach in data mining applications.},
  number = {1},
  journal = {Data Mining and Knowledge Discovery},
  author = {Han, Jiawei and Cheng, Hong and Xin, Dong and Yan, Xifeng},
  month = aug,
  year = {2007},
  keywords = {association rules, data mining, frequent pattern mining, pattern analysis, pattern matching, survey},
  pages = {55--86}
},
@inproceedings{zhang_automated_2008,
  title = {Automated aspect recommendation through clustering-based fan-in analysis},
  issn = {1527-1366},
  abstract = {Identifying code implementing a crosscutting concern {(CCC)} automatically can benefit the maintainability and evolvability of the application. Although many approaches have been proposed to identify potential aspects, a lot of manual work is typically required before these candidates can be converted into refactorable aspects. In this paper, we propose a new aspect mining approach, called clustering-based fan-in analysis {(CBFA)}, to recommend aspect candidates in the form of method clusters, instead of single methods. {CBFA} uses a new lexical based clustering approach to identify method clusters and rank the clusters using a new ranking metric called cluster fan-in.
Experiments on Linux and {JHotDraw} show that {CBFA} can provide accurate recommendations while improving aspect mining coverage significantly compared to other state-of-the-art mining approaches.},
  booktitle = {23rd {IEEE/ACM} International Conference on Automated Software Engineering, {ASE} 2008},
  author = {Zhang, Danfeng and Guo, Yao and Chen, Xiangqun},
  year = {2008},
  keywords = {aspects, clustering, crosscutting concern, metrics, software clustering},
  pages = {278--287},
  annote = {Clusters methods or functions to find cross-cutting concerns. The methods are clustered based on their parsed names. Relevance: 4}
},
@phdthesis{shtern_methods_2010,
  address = {Toronto, Ontario},
  type = {{PhD} Thesis},
  title = {Methods for evaluating, selecting and improving software clustering algorithms},
  abstract = {A common problem that the software industry has to face is the maintenance cost of industrial software systems. One of the main reasons for the high cost of maintenance is the inherent difficulty of understanding software systems that are large, complex, inconsistent (developed using mixed methodologies, have incomplete features) and integrated. One of the approaches that has been developed to deal with problems that arise from the sheer size and complexity of large software systems is software clustering. Decomposing a software system into smaller, more manageable subsystems can aid the process of understanding it significantly. Many software clustering algorithms that attempt to automatically construct decompositions of large pieces of software have been presented in the literature. Different algorithms construct different decompositions. Therefore, it is important to have methods that evaluate the quality of such automatic decompositions. This thesis presents approaches to a variety of problems associated with the evaluation of software clustering algorithms. Several methods that compare flat decompositions of software systems have been presented in the literature.
We introduce the first methods for the evaluation of nested decompositions. We also introduce a set of indicators that measure structure discrepancy between software decompositions. The structure indicators can augment the evaluation picture provided by traditional comparison methods, such as {MoJoFM} and {KE}. In addition, we introduce and quantify the notion of clustering algorithm comparability. It is based on the concept that algorithms with different objectives should not be directly compared. We also introduce a novel process for software clustering evaluation that utilizes multiple simulated authoritative decompositions. The big selection of software clustering algorithms raises the question of how to select a software clustering algorithm that is best suited for a specific software system. In this thesis, we introduce a method for the selection of a software clustering algorithm for specific needs. The proposed algorithm selection method is based on a newly introduced formal description template for software clustering algorithms.
Using the same template, we also introduce a method for software clustering algorithm improvement.},
  school = {York University},
  author = {Shtern, Mark},
  year = {2010},
  keywords = {cluster evaluation, clustering, metrics, software clustering, thesis}
},
@inproceedings{czibula_hierarchical_2007,
  title = {A Hierarchical Clustering Algorithm for Software Systems Design Improvement},
  booktitle = {Proceedings of the First International Conference on Knowledge Engineering: Principles and Techniques},
  author = {Czibula, Istv{\'a}n-Gergely and Serban, Gabriela},
  year = {2007},
  keywords = {clustering, refactoring, software clustering},
  pages = {316--323}
},
@article{kagdi_survey_2007,
  title = {A survey and taxonomy of approaches for mining software repositories in the context of software evolution},
  volume = {19},
  url = {http://portal.acm.org/citation.cfm?id=1345056.1345057},
  abstract = {A comprehensive literature survey on approaches for mining software repositories {(MSR)} in the context of software evolution is presented. In particular, this survey deals with those investigations that examine multiple versions of software artifacts or other temporal information. A taxonomy is derived from the analysis of this literature and presents the work via four dimensions: the type of software repositories mined (what), the purpose (why), the adopted/invented methodology used (how), and the evaluation method (quality). The taxonomy is demonstrated to be expressive (i.e., capable of representing a wide spectrum of {MSR} investigations) and effective (i.e., facilitates similarities and comparisons of {MSR} investigations). Lastly, a number of open research issues in {MSR} that require further investigation are identified.},
  number = {2},
  journal = {Journal of Software Maintenance and Evolution: Research and Practice},
  author = {Kagdi, Huzefa and Collard, Michael L.
and Maletic, Jonathan I.},
  year = {2007},
  keywords = {multi-version analysis, software evolution},
  pages = {77--131}
},
@book{bloch_effective_2008,
  title = {Effective {Java}},
  edition = {Second},
  isbn = {9780321356680},
  abstract = {Are you looking for a deeper understanding of the Java programming language so that you can write code that is clearer, more correct, more robust, and more reusable? Look no further! Effective Java, Second Edition, brings together seventy-eight indispensable programmer's rules of thumb: working, best-practice solutions for the programming challenges you encounter every day. This highly anticipated new edition of the classic, Jolt Award-winning work has been thoroughly updated to cover Java {SE} 5 and Java {SE} 6 features introduced since the first edition. Bloch explores new design patterns and language idioms, showing you how to make the most of features ranging from generics to enums, annotations to autoboxing. Each chapter in the book consists of several {\textquotedblleft}items{\textquotedblright} presented in the form of a short, standalone essay that provides specific advice, insight into Java platform subtleties, and outstanding code examples. The comprehensive descriptions and explanations for each item illuminate what to do, what not to do, and why.
Highlights include: New coverage of generics, enums, annotations, autoboxing, the for-each loop, varargs, concurrency utilities, and much more; Updated techniques and best practices on classic topics, including objects, classes, libraries, methods, and serialization; How to avoid the traps and pitfalls of commonly misunderstood subtleties of the language; Focus on the language and its most fundamental libraries: java.lang, java.util, and, to a lesser extent, java.util.concurrent and java.io. Simply put, Effective Java, Second Edition, presents the most practical, authoritative guidelines available for writing efficient, well-designed programs.},
  publisher = {{Addison-Wesley} Professional},
  author = {Bloch, Joshua},
  year = {2008},
  annote = {Uses "value classes"}
},
@mastersthesis{cassell_tools_1985,
  type = {Master's thesis},
  title = {Tools for the analysis of large {Prolog} programs},
  school = {University of Texas - Austin},
  author = {Cassell, Keith A.},
  year = {1985},
  annote = {Also available as {MCC} Technical Report Number {DB-171-85}, 1985.}
},
@article{czibula_improving_2006,
  title = {Improving Systems Design using a Clustering Approach},
  volume = {6},
  abstract = {Clustering is a division of data into groups of similar objects, a data mining activity that aims to differentiate groups inside a given set of objects, with respect to a set of relevant attributes of the analyzed objects. Refactoring is the process of improving the design of software systems. Its goal is to change a software system in such a way that it does not alter the external behavior of the code, but improves its internal structure ([9]). This paper aims at presenting a new approach for improving systems design using clustering. Clustering is used in order to recondition the class structure of a software system. The proposed approach can be useful for assisting software engineers in their daily works of refactoring software systems.
We evaluate our approach using the open source case study {JHotDraw} ([18]) based on two newly defined measures. A comparison with previous approaches is also provided.},
  number = {12},
  journal = {{IJCSNS} International Journal of Computer Science and Network Security},
  author = {Czibula, Istv{\'a}n-Gergely and Serban, Gabriela},
  year = {2006},
  keywords = {clustering, refactoring, software clustering, systems reengineering},
  pages = {40--49},
  annote = {The cardinality of the vector space is the number of application classes from the software system. For each class, method, and attribute, the Jaccard distances to each class are calculated.}
},
@article{dit_feature_????,
  title = {Feature location in source code: a taxonomy and survey},
  issn = {1532-0618},
  shorttitle = {Feature location in source code},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/smr.567/abstract},
  doi = {10.1002/smr.567},
  abstract = {Feature location is the activity of identifying an initial location in the source code that implements functionality in a software system. Many feature location techniques have been introduced that automate some or all of this process, and a comprehensive overview of this large body of work would be beneficial to researchers and practitioners. This paper presents a systematic literature survey of feature location techniques. Eighty-nine articles from 25 venues have been reviewed and classified within the taxonomy in order to organize and structure existing work in the field of feature location. The paper also discusses open issues and defines future directions in the field of feature location.
Copyright {\textcopyright} 2011 John Wiley \& Sons, Ltd.},
  journal = {Journal of Software Maintenance and Evolution: Research and Practice},
  author = {Dit, Bogdan and Revelle, Meghan and Gethers, Malcom and Poshyvanyk, Denys},
  year = {2011},
  keywords = {concept location, Feature location, program comprehension, software maintenance and evolution}
},
@article{lakhotia_unified_1997,
  title = {A unified framework for expressing software subsystem classification techniques},
  volume = {36},
  issn = {0164-1212},
  url = {http://www.sciencedirect.com/science/article/B6V0N-3WJNWVC-2/2/c47b54e177549f1b7a1744499f98f07d},
  doi = {10.1016/0164-1212(95)00098-4},
  abstract = {The architecture of a software system classifies its components into subsystems and describes the relationships between the subsystems. The information contained in such an abstraction is of immense significance in various software maintenance activities. There is considerable interest in extracting the architecture of a software system from its source code and, hence, in techniques that classify the components of a program into subsystems. Techniques for classifying subsystems presented in the literature differ in the type of components they place in a subsystem and the information they use to identify related components. However, these techniques have been presented using different terminology and symbols, making it harder to perform comparative analyses. This article presents a unified framework for expressing techniques of classifying subsystems of a software system. The framework is comprised of a consistent set of terminology, notation, and symbols that may be used to describe the input, output, and processing performed by these techniques. Using this framework, several subsystem classification techniques have been reformulated.
This reformulation makes it easier to compare these techniques and provides a first step towards evaluating their relative effectiveness.},
  number = {3},
  journal = {Journal of Systems and Software},
  author = {Lakhotia, Arun},
  month = mar,
  year = {1997},
  keywords = {clustering, frameworks, software clustering},
  pages = {211--231}
},
@article{chen_new_1993,
  title = {A new metric for object-oriented design},
  volume = {35},
  number = {4},
  journal = {Information and Software Technology},
  author = {Chen, {J.-Y.} and Lu, {J.-F.}},
  month = apr,
  year = {1993},
  keywords = {cohesion, metrics},
  pages = {232--240}
},
@article{murphy-hill_refactoring_2008,
  title = {Refactoring Tools: Fitness for Purpose},
  volume = {25},
  issn = {0740-7459},
  shorttitle = {Refactoring Tools},
  abstract = {Refactoring tools can improve the speed and accuracy with which developers create and maintain software{\textemdash}but only if they are used. In practice, tools are not used as much as they could be; this seems to be because sometimes they do not align with the refactoring tactic preferred by most programmers, a tactic the authors call "floss refactoring." They propose five principles that characterize successful floss-refactoring tools{\textemdash}principles that can help programmers to choose the most appropriate refactoring tools and also help toolsmiths to design tools that fit the programmer's purpose.},
  number = {5},
  journal = {{IEEE} Software},
  author = {{Murphy-Hill}, Emerson and Black, Andrew P.},
  year = {2008},
  keywords = {refactoring},
  pages = {38--44}
},
@book{patrick_professional_2010,
  title = {Professional {Oracle} {WebLogic} Server},
  isbn = {9781118057360},
  abstract = {Authoritative guide to Oracle {WebLogic} Server---from Oracle insiders. If you're an experienced Java developer who wants to expand your skills, Professional Oracle {WebLogic} Server is the perfect guide for you.
This book is written by a top-notch author team that includes one of the lead architects from Oracle's Fusion Middleware Development Architects team. Follow their best practices, workarounds, and sound techniques and confidently develop even the most mission-critical applications with {WebLogic} Server. This book fully covers {WebLogic} Server 11g, including the new features of both {JEE} 5 and {WebLogic} Server, as well as {JEE} 5 annotations, Spring, {JPA}, {JAX-WS}, {JMS} {Store-And-Forward}, {SAML} support, and the {WLST} administrative scripting tool. This book is the authoritative guide to: Choosing a Web application architecture; Best practices for development and production environments; Designing a Java {EE} application; Building Enterprise {JavaBeans} in {WebLogic} Server; Building an {EJB} application; Packaging and deploying {WebLogic} web applications; Developing and deploying web services; Using {WebLogic} {JMS}; Using {WebLogic} security; Administering and deploying applications in {WebLogic} Server; Optimizing {WebLogic} Server performance},
  publisher = {John Wiley and Sons},
  author = {Patrick, Robert and Nyberg, Gregory and Aston, Philip},
  month = dec,
  year = {2010},
  annote = {Uses "data transfer class"}
},
@book{zakhour_java_2006,
  edition = {Fourth},
  title = {The {Java} Tutorial: A Short Course on the Basics},
  isbn = {0321334205},
  shorttitle = {The {Java} Tutorial},
  publisher = {Prentice Hall},
  author = {Zakhour, Sharon and Hommel, Scott and Royal, Jacob and Rabinovitch, Isaac and Risser, Tom and Hoeber, Mark},
  month = oct,
  year = {2006}
},
@article{ottenstein_algorithmic_1976,
  title = {An algorithmic approach to the detection and prevention of plagiarism},
  volume = {8},
  url = {http://portal.acm.org/citation.cfm?id=382462},
  doi = {10.1145/382222.382462},
  number = {4},
  journal = {{ACM} {SIGCSE} Bulletin},
  author = {Ottenstein, K.
J.},
  year = {1976},
  keywords = {clone detection, plagiarism},
  pages = {30--41}
},
@article{choi_extracting_1990,
  title = {Extracting and Restructuring the Design of Large Systems},
  volume = {7},
  url = {http://portal.acm.org/citation.cfm?id=624909},
  abstract = {Extraction of the structural and, to a lesser degree, functional and dynamic properties of systems composed of modules and subsystems is treated. The process is equivalent to reverse engineering a system-level design description. The approach used is to map the resource exchange among modules and then derive a hierarchical design description using a system-restructuring algorithm. The medium for the design description is a module interconnection language, {NuMIL}. The performance of the algorithm shows that it is practical.},
  number = {1},
  journal = {{IEEE} Software},
  author = {Choi, Song C. and Scacchi, Walt},
  year = {1990},
  keywords = {module interconnection language, restructuring, reverse engineering, subsystem identification},
  pages = {66--71}
},
@article{meyers_empirical_2007,
  title = {An empirical study of slice-based cohesion and coupling metrics},
  volume = {17},
  issn = {1049-331X},
  url = {http://portal.acm.org/citation.cfm?doid=1314493.1314495},
  doi = {10.1145/1314493.1314495},
  abstract = {Software reengineering is a costly endeavor, due in part to the ambiguity of where to focus reengineering effort. Coupling and Cohesion metrics, particularly quantitative cohesion metrics, have the potential to aid in this identification and to measure progress. The most extensive work on such metrics is with slice-based cohesion metrics. While their use of semantic dependence information should make them an excellent choice for cohesion measurement, their wide spread use has been impeded in part by a lack of empirical study. Recent advances in software tools make, for the first time, a large-scale empirical study of slice-based cohesion and coupling metrics possible. Four results from such a study are presented.
First, {\textquotedblleft}head-to-head{\textquotedblright} qualitative and quantitative comparisons of the metrics identify which metrics provide similar views of a program and which provide unique views of a program. This study includes statistical analysis showing that slice-based metrics are not proxies for simple size-based metrics such as lines of code. Second, two longitudinal studies show that slice-based metrics quantify the deterioration of a program as it ages. This serves to validate the metrics: the metrics quantify the degradation that exists during development; turning this around, the metrics can be used to measure the progress of a reengineering effort. Third, baseline values for slice-based metrics are provided. These values act as targets for reengineering efforts with modules having values outside the expected range being the most in need of attention. Finally, slice-based coupling is correlated and compared with slice-based cohesion.},
  number = {1},
  journal = {{ACM} Transactions on Software Engineering and Methodology},
  author = {Meyers, Timothy M. and Binkley, David},
  month = dec,
  year = {2007},
  pages = {1--27}
},
@article{rao_identifying_2011,
  title = {Identifying Clusters of Concepts in a Low Cohesive Class for Extract Class Refactoring Using Metrics Supplemented Agglomerative Clustering Technique},
  volume = {8},
  issn = {1694-0814},
  abstract = {Object oriented software with low cohesive classes can increase maintenance cost. Low cohesive classes are likely to be introduced into the software during initial design due to deviation from design principles and during evolution due to software deterioration. Low cohesive class performs operations that should be done by two or more classes. The low cohesive classes need to be identified and refactored using extract class refactoring to improve the cohesion.
In this regard, two aspects are involved; the first one is to identify the low cohesive classes and the second one is to identify the clusters of concepts in the low cohesive classes for extract class refactoring. In this paper, we propose metrics supplemented agglomerative clustering technique for covering the above two aspects. The proposed metrics are validated using Weyuker{\textquoteright}s properties. The approach is applied successfully on two examples and on a case study.},
  number = {5},
  journal = {International Journal of Computer Science Issues},
  author = {Rao, A. Ananda and Reddy, N. Narendar},
  month = sep,
  year = {2011},
  pages = {185--194}
},
@article{al_dallal_mathematical_2010,
  title = {Mathematical Validation of {Object-Oriented} Class Cohesion Metrics},
  volume = {4},
  issn = {1998-4308},
  abstract = {Class cohesion is an object-oriented software quality attribute and refers to the extent to which the members of a class are related. Software developers use class cohesion measures to assess the quality of their products and to guide the restructuring of poorly designed classes. Several class cohesion metrics are proposed in the literature, and a few of them are mathematically validated against the necessary properties of class cohesion. Metrics that violate class cohesion properties are not well defined, and their utility as indicators of the relatedness of class members is questionable. The purpose of this paper is to mathematically validate sixteen class cohesion metrics using class cohesion properties.
Results show that metrics differ considerably in satisfying the cohesion properties; some of them satisfy all properties, while others satisfy none.},
  number = {2},
  journal = {International Journal of Computers},
  author = {Al Dallal, Jehad},
  year = {2010},
  keywords = {cohesion, metrics, metrics validation},
  pages = {45--52},
  annote = {Assesses the validity of sixteen class cohesion metrics using the four properties from Briand. Relevance: 4}
},
@inproceedings{batagelj_pajek_2002,
  title = {{Pajek} - analysis and visualization of large networks},
  booktitle = {Graph Drawing},
  author = {Batagelj, V. and Mrvar, A.},
  year = {2002},
  keywords = {graphs, visualization},
  pages = {8--11}
},
@incollection{staab_verification_2006,
  address = {Berlin, Heidelberg},
  title = {Verification and Refactoring of Ontologies with Rules},
  volume = {4248},
  isbn = {978-3-540-46363-4, 978-3-540-46365-8},
  url = {http://www.springerlink.com/content/f712678m71813116/},
  booktitle = {Managing Knowledge in a World of Networks},
  publisher = {Springer},
  author = {Baumeister, Joachim and Seipel, Dietmar},
  editor = {Staab, Steffen and Sv{\'a}tek, Vojt{\v{e}}ch},
  year = {2006},
  keywords = {ontologies, refactoring},
  pages = {82--95}
},
@article{chidamber_towards_1991,
  title = {Towards a metrics suite for object oriented design},
  volume = {26},
  url = {http://portal.acm.org/citation.cfm?id=117970},
  doi = {10.1145/118014.117970},
  abstract = {While software metrics are a generally desirable feature in the software management functions of project planning and project evaluation, they are of especial importance with a new technology such as the object-oriented approach. This is due to the significant need to train software engineers in generally accepted object-oriented principles. This paper presents theoretical work that builds a suite of metrics for object-oriented design.
In particular, these metrics are based upon measurement theory and are informed by the insights of experienced object-oriented software developers. The proposed metrics are formally evaluated against a widely accepted list of software metric evaluation criteria.},
  number = {11},
  journal = {{ACM} {SIGPLAN} Notices},
  author = {Chidamber, Shyam R. and Kemerer, Chris F.},
  year = {1991},
  keywords = {cohesion, coupling, metrics, {OOD}},
  pages = {197--211}
},
@inproceedings{cassell_towards_2009,
  address = {Auckland, {NZ}},
  title = {Towards automating class-splitting using betweenness clustering},
  isbn = {978-1-4244-5259-0},
  doi = {10.1109/ASE.2009.21},
  abstract = {Large, unwieldy classes are a significant maintenance problem. Programmers dislike them because the fundamental logic is often obscured, making them hard to understand and modify. This paper proposes a solution - a semi-automatic technique for splitting large classes into smaller, more cohesive ones. The core of the technique is the use of betweenness clustering to identify the best way of partitioning a class. This turned a tedious manual process into a quick and simple semi-automated one in roughly one third of the cases we examined.},
  booktitle = {24th {IEEE/ACM} International Conference on Automated Software Engineering},
  author = {Cassell, Keith and Andreae, Peter and Groves, Lindsay and Noble, James},
  month = nov,
  year = {2009},
  keywords = {betweenness, clustering, cohesion, graphs, maintainability, refactoring, software clustering},
  pages = {595--599}
},
@article{ducasse_class_2005,
  title = {The Class Blueprint: Visually Supporting the Understanding of Classes},
  volume = {31},
  shorttitle = {The Class Blueprint},
  url = {http://portal.acm.org/citation.cfm?id=1048912},
  abstract = {Understanding source code is an important task in the maintenance of software systems.
Legacy systems are not only limited to procedural languages, but are also written in object-oriented languages. In such a context, understanding classes is a key activity as they are the cornerstone of the object-oriented paradigm and the primary abstraction from which applications are built. Such an understanding is however difficult to obtain because of reasons such as the presence of late binding and inheritance. A first level of class understanding consists of the understanding of its overall structure, the control flow among its methods, and the accesses on its attributes. We propose a novel visualization of classes called class blueprint that is based on a semantically enriched visualization of the internal structure of classes. This visualization allows a software engineer to build a first mental model of a class that he validates via opportunistic code-reading. Furthermore, we have identified visual patterns that represent recurrent situations and as such convey additional information to the viewer. The contributions of this article are the class blueprint, a novel visualization of the internal structure of classes, the identification of visual patterns, and the definition of a vocabulary based on these visual patterns. We have performed several case studies of which one is presented in depth, and validated the usefulness of the approach in a controlled experiment.},
  number = {1},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Ducasse, St{\'e}phane},
  year = {2005},
  keywords = {reverse engineering, visual patterns, visualization},
  pages = {75--90},
  annote = {This paper is primarily concerned with visualizing the interrelationships of methods and attributes within classes. They provide representative call graphs.
Relevance: 4}
}

@misc{schaub_eclipse_2008,
  title = {Eclipse Corner Article: Creating Database Web Applications with Eclipse},
  url = {http://www.eclipse.org/articles/article.php?file=Article-EclipseDbWebapps/index.html},
  author = {Schaub, Stephen},
  month = jan,
  year = {2008},
  note = {Accessed 2010-04-28},
  keywords = {database, Eclipse},
  howpublished = {{http://www.eclipse.org/articles/article.php?file=Article-EclipseDbWebapps/index.html}}
}

@article{briand_property-based_1996,
  title = {Property-based software engineering measurement},
  volume = {22},
  abstract = {Little theory exists in the field of software system measurement. Concepts such as complexity, coupling, cohesion or even size are very often subject to interpretation and appear to have inconsistent definitions in the literature. As a consequence, there is little guidance provided to the analyst attempting to define proper measures for specific problems. Many controversies in the literature are simply misunderstandings and stem from the fact that some people talk about different measurement concepts under the same label (complexity is the most common case). There is a need to define unambiguously the most important measurement concepts used in the measurement of software products. One way of doing so is to define precisely what mathematical properties characterize these concepts, regardless of the specific software artifacts to which these concepts are applied. Such a mathematical framework could generate a consensus in the software engineering community and provide a means for better communication among researchers, better guidelines for analysts, and better evaluation methods for commercial static analyzers for practitioners. In this paper, we propose a mathematical framework which is generic, because it is not specific to any particular software artifact, and rigorous, because it is based on precise mathematical concepts.
We use this framework to propose definitions of several important measurement concepts (size, length, complexity, cohesion, coupling). It does not intend to be complete or fully objective; other frameworks could have been proposed and different choices could have been made. However, we believe that the formalisms and properties we introduce are convenient and intuitive. This framework contributes constructively to a firmer theoretical ground of software measurement.},
  number = {1},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {Briand, Lionel C. and Morasca, S. and Basili, V. R.},
  year = {1996},
  keywords = {cohesion, metrics, metrics validation},
  pages = {68--86}
}

@inproceedings{kanellopoulos_interpretation_2008,
  title = {Interpretation of Source Code Clusters in Terms of the {ISO/IEC-9126} Maintainability Characteristics},
  volume = {2008},
  abstract = {Clustering is a data mining technique that allows the grouping of data points on the basis of their similarity with respect to multiple dimensions of measurement. It has also been applied in the software engineering domain, in particular to support software quality assessment based on source code metrics. Unfortunately, since clusters emerge from metrics at the source code level, it is difficult to interpret the significance of clusters at the level of the quality of the entire system. In this paper, we propose a method for interpreting source code clusters using the {ISO/IEC} 9126 software product quality model. Several methods have been proposed to perform quantitative assessment of software systems in terms of the quality characteristics defined by {ISO/IEC} 9126. These methods perform mappings of low-level source code metrics to high-level quality characteristics by various aggregation and weighting procedures. We applied such a method to obtain quality profiles at various abstraction levels for each generated source code cluster.
Subsequently, the plethora of quality profiles obtained is visualized such that conclusions about different quality problems in various clusters can be obtained at a glance.},
  booktitle = {Proceedings of the 12th European Conference on Software Maintenance and Reengineering},
  author = {Kanellopoulos, Y. and Tjortjis, C. and Heitlager, I. and Visser, J.},
  year = {2008},
  keywords = {clustering, maintainability, maintenance},
  pages = {63--72}
}

@misc{beck_bad_????,
  title = {Bad Smells in Code},
  url = {http://sourcemaking.com/refactoring/bad-smells-in-code},
  journal = {Bad Smells Design Pattern in Code},
  author = {Beck, Kent and Fowler, Martin},
  howpublished = {http://sourcemaking.com/refactoring/bad-smells-in-code}
}

@inproceedings{de_lucia_using_2008,
  address = {Beijing},
  title = {Using structural and semantic metrics to improve class cohesion},
  abstract = {Several refactoring methods have been proposed in the literature to improve the cohesion of classes. Very often, refactoring operations are guided by cohesion metrics based on the structural information of the source code, such as attribute references in methods. In this paper we present a novel approach to guide the extract class refactoring {(M.} Fowler, 1999), taking into account structural and semantic cohesion metrics. The proposed approach has been evaluated in a case study conducted on {JHotDraw}, an open source software system. The achieved results revealed that the performance achieved with the proposed approach significantly outperforms the results achieved with methods considering only structural or semantic information. The proposed approach has also been integrated in the Eclipse platform.},
  booktitle = {{IEEE} International Conference on Software Maintenance},
  author = {De Lucia, A. and Oliveto, R.
and Vorraro, L.},
  month = sep,
  year = {2008},
  keywords = {cohesion, extract class, metrics, refactoring, semantics},
  pages = {27--36},
  annote = {Discusses using both structural and semantic similarity to guide an extract class refactoring. They use max-flow/min cut to separate a graph into two. The graph is composed of nodes representing members; each pair is connected by edges that are weighted by the similarity measure. The graph does *not* maintain the structural connections between methods, although the connections should at least be partially reflected in the weightings. Relevance: 5}
}

@article{wu_top_2007,
  title = {Top 10 algorithms in data mining},
  volume = {14},
  issn = {0219-1377},
  url = {http://dx.doi.org/10.1007/s10115-007-0114-2},
  doi = {10.1007/s10115-007-0114-2},
  abstract = {This paper presents the top 10 data mining algorithms identified by the {IEEE} International Conference on Data Mining {(ICDM)} in December 2006: C4.5, {k-Means}, {SVM}, Apriori, {EM}, {PageRank}, {AdaBoost}, {kNN}, Naive Bayes, and {CART.} These top 10 algorithms are among the most influential data mining algorithms in the research community. With each algorithm, we provide a description of the algorithm, discuss the impact of the algorithm, and review current and further research on the algorithm. These 10 algorithms cover classification, clustering, statistical learning, association analysis, and link mining, which are all among the most important topics in data mining research and development.},
  journal = {Knowledge and Information Systems},
  author = {Wu, Xindong and Kumar, Vipin and Quinlan, J. Ross
and Ghosh, Joydeep and Yang, Qiang and Motoda, Hiroshi and {McLachlan}, Geoffrey J and Ng, Angus and Liu, Bing and Yu, Philip S and Zhou, {Zhi-Hua} and Steinbach, Michael and Hand, David J and Steinberg, Dan},
  month = dec,
  year = {2007},
  note = {{ACM} {ID:} 1327436},
  keywords = {algorithms, data mining, theory},
  pages = {1--37}
}

@book{beck_extreme_2000,
  title = {Extreme programming explained: embrace change},
  isbn = {9780201616415},
  shorttitle = {Extreme programming explained},
  abstract = {The problem. Risk: The basic problem. A development episode. Economics of software development. Four variables. Cost of change. Learning to drive. Four values. Basic principles. Back to basics. The solution. Quick overview. How could this work? Management strategy. Facilities strategy. Splitting business and technical responsibility. Planning strategy. Development strategy. Design strategy. Testing strategy. Implementing {XP.} Adopting {XP.} Retrofitting {XP.} Lifecycle of an ideal {XP} project. Roles for people. 20-80 rule. What makes {XP} hard. When you shouldn't try {XP.} {XP} at work.
Conclusion.},
  publisher = {{Addison-Wesley} Professional},
  author = {Beck, Kent},
  year = {2000},
  keywords = {agile, {XP}}
}

@book{stevens_structured_1979,
  address = {Upper Saddle River, {NJ}, {USA}},
  title = {Structured Design},
  publisher = {Yourdon Press},
  author = {Stevens, Wayne and Myers, Glenford and Constantine, Larry},
  year = {1979}
}

@inproceedings{tonelli_analysis_2011,
  title = {An analysis of {SNA} metrics on the Java Qualitas Corpus},
  isbn = {9781450305594},
  url = {http://dl.acm.org/citation.cfm?id=1953382},
  doi = {10.1145/1953355.1953382},
  publisher = {{ACM} Press},
  author = {Tonelli, Roberto and Concas, Giulio and Marchesi, Michele and Murgia, Alessandro},
  year = {2011},
  pages = {205--213}
}

@misc{tairas_clone_2008,
  title = {Clone Detection Literature},
  url = {http://students.cis.uab.edu/tairasr/clones/literature/},
  publisher = {University of Alabama at Birmingham},
  author = {Tairas, Robert},
  year = {2008},
  keywords = {clone detection, clones, survey}
}

@article{buckley_towards_2005,
  title = {Towards a taxonomy of software change},
  volume = {17},
  shorttitle = {Towards a taxonomy of software change},
  url = {http://portal.acm.org/citation.cfm?id=1090746},
  abstract = {Previous taxonomies of software change have focused on the purpose of the change (i.e., the why) rather than the underlying mechanisms. This paper proposes a taxonomy of software change based on characterizing the mechanisms of change and the factors that influence these mechanisms. The ultimate goal of this taxonomy is to provide a framework that positions concrete tools, formalisms and methods within the domain of software evolution. Such a framework would considerably ease comparison between the various mechanisms of change. It would also allow practitioners to identify and evaluate the relevant tools, methods and formalisms for a particular change scenario.
As an initial step towards this taxonomy, the paper presents a framework that can be used to characterize software change support tools and to identify the factors that impact on the use of these tools. The framework is evaluated by applying it to three different change support tools and by comparing these tools based on this analysis. Copyright {\textcopyright} 2005 John Wiley \& Sons, Ltd.},
  number = {5},
  journal = {Journal of Software Maintenance and Evolution: Research and Practice},
  author = {Buckley, Jim and Mens, Tom and Zenger, Matthias and Rashid, Awais and Kniesel, G{\"u}nter},
  year = {2005},
  pages = {309--332}
}

@inproceedings{ott_slice_1993,
  title = {Slice Based Metrics for Estimating Cohesion},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.55.8094},
  booktitle = {Proceedings of the {IEEE-CS} International Metrics Symposium},
  author = {Ott, Linda M and Thuss, Jeffrey J},
  year = {1993},
  keywords = {aspects, cohesion, metrics},
  pages = {71--81}
}

@article{kuhn_abstract_2006,
  title = {Abstract Syntax Tree},
  journal = {Eclipse Corner},
  author = {Kuhn, Thomas and Thomann, Olivier},
  month = nov,
  year = {2006},
  keywords = {abstract syntax trees, Eclipse},
  annote = {Describes {AST} {APIs} in the context of an example. Relevance: 5 }
}

@inproceedings{lindsay_does_2010,
  address = {Cape Town, South Africa},
  title = {Does size matter?: a preliminary investigation of the consequences of powerlaws in software},
  isbn = {978-1-60558-976-3},
  shorttitle = {Does size matter?},
  url = {http://portal.acm.org/citation.cfm?doid=1809223.1809226},
  doi = {10.1145/1809223.1809226},
  abstract = {There is increasing evidence that many object-oriented software size metrics are characterised by scale-free, powerlaw distributions. This means programs will have arbitrarily large components, and the size of the largest component will increase as programs' overall size increases.
This directly contradicts a crucial assumption of object-oriented design --- that large programs can be built by combining many small components.},
  booktitle = {Proceedings of the 2010 {ICSE} Workshop on Emerging Trends in Software Metrics},
  publisher = {{ACM}},
  author = {Lindsay, Joshua and Noble, James and Tempero, Ewan},
  year = {2010},
  keywords = {empirical, metrics, {OOD}, powerlaws, size},
  pages = {16--23}
}

@inproceedings{shtern_comparability_2010,
  address = {Los Alamitos, {CA}, {USA}},
  title = {On the Comparability of Software Clustering Algorithms},
  doi = {10.1109/ICPC.2010.25},
  abstract = {Evaluation of software clustering algorithms is typically done by comparing the clustering results to an authoritative decomposition prepared manually by a system expert. A well-known drawback of this approach is the fact that there are many, equally valid ways to decompose a software system, since different clustering objectives create different decompositions. Evaluating all clustering algorithms against a single authoritative decomposition can lead to biased results. In this paper, we introduce and quantify the notion of clustering algorithm comparability. It is based on the concept that algorithms with different objectives should not be directly compared.
Not surprisingly, we find that several of the published algorithms in the literature are not comparable to each other.},
  booktitle = {International Conference on Program Comprehension},
  publisher = {{IEEE} Computer Society},
  author = {Shtern, Mark and Tzerpos, Vassilios},
  year = {2010},
  keywords = {cluster evaluation, clustering, software clustering},
  pages = {64--67}
}

@inproceedings{baxter_automated_2005,
  title = {Automated Program and Model Transformation Technology},
  author = {Baxter, Ira and Gray, Jeff},
  month = oct,
  year = {2005},
  note = {October 24--27, 2005},
  keywords = {refactoring},
  annote = {This tutorial discusses program transformations via rewrite rules, including the mathematical basis and the desirable components of a transformation system. Relevance: 5}
}

@book{newman_networks:_2010,
  edition = {1},
  title = {Networks: An Introduction},
  isbn = {0199206651},
  shorttitle = {Networks},
  publisher = {Oxford University Press, {USA}},
  author = {Newman, Mark},
  month = may,
  year = {2010}
}

@inproceedings{tempero_qualitas_2010,
  title = {Qualitas Corpus: A Curated Collection of Java Code for Empirical Studies},
  booktitle = {2010 Asia Pacific Software Engineering Conference {(APSEC2010)}},
  author = {Tempero, Ewan and Anslow, Craig and Dietrich, Jens and Han, Ted and Li, Jing and Lumpe, Markus and Melton, Hayden and Noble, James},
  month = dec,
  year = {2010},
  keywords = {empirical}
}

@book{saunders_intellij_2006,
  title = {{IntelliJ} {IDEA} in Action},
  isbn = {1932394443},
  publisher = {Manning Publications},
  author = {Saunders, Stephen and Fields, Duane K.
and Belayev, Eugene},
  month = mar,
  year = {2006},
  keywords = {{IDE}}
}

@inproceedings{counsell_common_2006,
  address = {Rio de Janeiro, Brazil},
  title = {Common refactorings, a dependency graph and some code smells},
  url = {http://portal.acm.org/citation.cfm?id=1159777},
  doi = {10.1145/1159733.1159777},
  abstract = {Refactoring, as a software engineering discipline has emerged over recent years to become an important aspect of maintaining software. Refactoring refers to the restructuring of software according to specific mechanics and principles. In this paper, we describe an analysis of the results from a tool whose purpose was to identify and extract refactorings from seven open-source Java systems. In particular, we analyzed the mechanics of the most commonly and least commonly applied refactorings to try and account for their frequency. Results showed the most common refactorings of the fifteen coined a {{\textquoteleft}Gang} of Six{\textquoteright}, to be generally those with a high in-degree and low out-degree when mapped on a dependency graph; the same refactorings also featured strongly in the remedying of bad code smells. Remarkably and surprisingly, inheritance and encapsulation- based refactorings were found to have been applied relatively infrequently - we offer explanations for why this may be the case. The paper thus identifies {\textquoteleft}core{\textquoteright} refactorings central to many of the changes made by developers on open-source systems. While we can not guarantee that developers consciously undertake refactoring in any sense, the empirical results demonstrate that simple renaming and moving fields/methods between classes are common components of open-source system re-engineering.
From a wider software engineering perspective, knowledge of what a modification will incur in likely sub-tasks is of value to developers whether working on open-source or other forms of software.},
  booktitle = {Proceedings of the 2006 {ACM/IEEE} International symposium on empirical software engineering - {ISESE} '06},
  author = {Counsell, S. and Hassoun, Y. and Loizou, G. and Najjar, R.},
  year = {2006},
  keywords = {graphs, refactoring, smells},
  pages = {288}
}

@inproceedings{wiggerts_using_1997,
  title = {Using Clustering Algorithms in Legacy Systems Remodularization},
  isbn = {0-8186-8162-4},
  url = {http://portal.acm.org/citation.cfm?id=836999},
  abstract = {Incited by the observation that cluster analysis and the remodularization of software systems solve similar problems, we have done research in both these areas in order to provide theoretical background for the application of cluster analysis in systems remodularization. In this article, we present an overview of cluster analysis and of systems remodularization. It appears that system remodularization techniques often either reinvent clustering techniques or could be augmented by them. We also give directions for further research.},
  booktitle = {Proceedings of the Fourth Working Conference on Reverse Engineering {(WCRE} '97)},
  publisher = {{IEEE} Computer Society},
  author = {Wiggerts, T. A.},
  year = {1997},
  keywords = {clustering, restructuring, software clustering, software modules, subsystem identification},
  pages = {33}
}

@inproceedings{bieman_cohesion_1995,
  address = {Seattle, Washington, United States},
  series = {{SSR} '95},
  title = {Cohesion and reuse in an object-oriented system},
  volume = {20},
  isbn = {0-89791-739-1},
  url = {http://portal.acm.org/citation.cfm?doid=223427.211856},
  doi = {10.1145/223427.211856},
  abstract = {We define and apply two new measures of object-oriented class cohesion to a reasonably large C++ system.
We find that most of the classes are quite cohesive, but that the classes that are reused more frequently via inheritance exhibit clearly lower cohesion.},
  booktitle = {{SIGSOFT} Software Eng. Notes, Proceedings of the 1995 Symposium on Software Reusability},
  publisher = {{ACM}},
  author = {Bieman, James M. and Kang, {Byung-Kyoo}},
  year = {1995},
  keywords = {cohesion, inheritance, metrics},
  pages = {259--262},
  annote = {Define {TCC} and {LCC.} These are {C\&K} like, but include indirectly accessed fields, exclude constructors as arbitrary cohesion raisers. They mention that their metrics can be used in 3 variations - with all inherited members, with no inherited members, or with only inheriting fields. {LCC} was picked as the best cohesion metric in Etzkorn's study.}
}

@inproceedings{walkinshaw_understanding_2005,
  title = {Understanding object-oriented source code from the behavioural perspective},
  issn = {1092-8138},
  abstract = {Comprehension is a key activity that underpins a variety of software maintenance and engineering tasks. The task of understanding object-oriented systems is hampered by the fact that the code segments that are related to a user-level function tend to be distributed across the system. We introduce a tool-supported code extraction technique that addresses this issue. Given a minimal amount of information about a behavioural element of the system that is of interest (such as a use-case), it extracts a trail of the methods (and method invocations) through the system that are needed in order to achieve an understanding of the implementation of the element of interest. We demonstrate the feasibility of our approach by implementing it as part of a code extraction tool, presenting a case study and evaluating the approach and tool against a set of established criteria for program comprehension tools.},
  booktitle = {Program Comprehension, 2005. {IWPC} 2005. Proceedings. 13th International Workshop on},
  author = {Walkinshaw, N. and Roper, M.
and Wood, M.},
  year = {2005},
  keywords = {hammock graphs, maintenance, {OOP}, program slicing, reverse engineering, slicing},
  pages = {215--224}
}

@inproceedings{barker_large-scale_2007,
  title = {A large-scale empirical comparison of object-oriented cohesion metrics},
  issn = {1530-1362},
  doi = {10.1109/ASPEC.2007.49},
  abstract = {Cohesion is an attribute of software design quality for which many metrics have been proposed. The different proposals have been made largely on theoretical grounds, with little evidence of actual use. This makes it difficult to provide advice to software developers as to how to interpret the measurements any given metric produces. This paper presents the first large-scale empirical study of object-oriented cohesion metrics. We apply 16 metrics from the literature, as well as a number of variations, to 92 open source and industry Java applications ranging in size from a few classes to several thousand, over 100,000 classes in all. Our results show that by and large applications have similar distributions of measurements according to any given metric, but that the distributions can be quite different across metrics. This provides useful information for the ongoing empirical validation efforts for cohesion metrics.},
  booktitle = {14th {Asia-Pacific} Software Engineering Conference},
  author = {Barker, Richard and Tempero, Ewan},
  year = {2007},
  keywords = {cohesion, comparative study, empirical, metrics, survey},
  pages = {414--421},
  annote = {A good overview of the various cohesion measures and representative values for real systems. One interesting result is that most of the metrics seem to have a bimodal distribution with most of the measurements being close to 0 or 1.
Relevance: 4}
}

@article{zager_graph_2008,
  title = {Graph similarity scoring and matching},
  volume = {21},
  issn = {0893-9659},
  url = {http://www.sciencedirect.com/science/article/B6TY9-4N9P4KC-2/2/30c3c3bd9791c990f57381f81d4595db},
  doi = {10.1016/j.aml.2007.01.006},
  abstract = {We outline a class of graph similarity measures that uses the structural similarity of local neighborhoods to derive pairwise similarity scores for the nodes of two different graphs, and present a related similarity measure that uses a linear update to generate both node and edge similarity scores. This measure is then applied to the task of graph matching.},
  number = {1},
  journal = {Applied Mathematics Letters},
  author = {Zager, Laura A. and Verghese, George C.},
  month = jan,
  year = {2008},
  keywords = {Graph matching, graphs, similarity},
  pages = {86--94}
}

@techreport{_standard_1990,
  title = {Standard Glossary of Software Engineering Terminology},
  number = {{IEEE} 610.12-1990},
  institution = {{IEEE}},
  month = jan,
  year = {1990},
  pages = {83}
}

@inproceedings{marinescu_identification_2006,
  title = {Identification of design roles for the assessment of design quality in enterprise applications},
  abstract = {The software industry is increasingly confronted with the issues of understanding and maintaining a special type of object-oriented systems, namely enterprise applications {(EA).} In the recent years many specific rules and patterns for the design of such applications were proposed. These new specific principles of {EA} design define precise roles (patterns) for classes and methods, and then describe {\textquotedblleft}good-design{\textquotedblright} rules in terms of such roles. Yet, these roles are rarely explicitly documented; therefore, due to their importance for an efficient understanding and assessment of {EA} design, they must be identified and localized in the source code based on their specificities.
In this paper we define a suite of techniques for the identification and location of four such roles, all related to the data source layer of an {EA.} Using the knowledge about these roles we show how this can improve the accuracy of formerly defined techniques for detecting two well-known design problems (i.e., Data Class and Feature Envy), making them more applicable for the usage on enterprise systems. Based on an experimental study conducted on three {EAs}, we prove the feasibility of the approach, discuss its benefits and touch the issues that need to be addressed in the future.},
  booktitle = {Proc. {IEEE} International Conference on Program Comprehension},
  author = {Marinescu, Cristina},
  year = {2006},
  keywords = {patterns, smells, software ecology}
}

@inproceedings{forster_cost_2006,
  address = {New York, {NY}, {USA}},
  series = {{PPPJ} '06},
  title = {Cost and benefit of rigorous decoupling with context-specific interfaces},
  isbn = {3-939352-05-5},
  location = {Mannheim, Germany},
  doi = {10.1145/1168054.1168059},
  abstract = {In Java programs, classes are coupled to each other through the use of typed references. In order to minimize coupling without changing the executed code, interfaces can be introduced for every declaration element such that each interface contains only those members that are actually needed from the objects referenced by that element. While these interfaces can be automatically computed using type inference, concerns have been raised that rigorous application of this principle would increase the number of types in a program to levels beyond manageability. It should be clear that decoupling is required only in selected places and no one would seriously introduce a minimal interface for every declaration element in a program.
Nevertheless we have investigated the actual cost of so doing (counted as the number of new types required) by applying rigorous decoupling to a number of open source Java projects, and contrasted it with the benefit, measured in terms of reduced overall coupling. Our results suggest that (a) fewer new interfaces are needed than one might believe and (b) that a small number of new interfaces accounts for a large number of declaration elements. Particularly the latter means that automated derivation of decoupling interfaces may at times be useful, if the number of new interfaces is limited a priori to the popular ones.},
  booktitle = {Proceedings of the 4th international symposium on Principles and practice of programming in Java},
  publisher = {{ACM}},
  author = {Forster, Florian},
  year = {2006},
  note = {{ACM} {ID:} 1168059},
  keywords = {coupling, design, interfaces, refactoring},
  pages = {23--30}
}

@inproceedings{simon_metrics_2001,
  title = {Metrics Based Refactoring},
  isbn = {0-7695-1028-0},
  url = {http://portal.acm.org/citation.cfm?id=794203.795287},
  abstract = {Refactoring is one key issue to increase internal software quality during the whole software lifecycle. Since identifying structures where refactorings should be applied often is explained with subjective perceptions like "bad taste" or "bad smell" an automatic refactoring location finder seems difficult. We show that a special kind of metrics can support these subjective perceptions and thus can be used as effective and efficient way to get support for the decision where to apply which refactoring. Due to the fact that the software developer is the last authority we provide powerful and metrics based software visualisation to support the developers judging their products.
In this paper we demonstrate this approach for four typical refactorings and present both a tool supporting the identification and case studies of its application.},
  booktitle = {Proceedings of the Fifth European Conference on Software Maintenance and Reengineering},
  publisher = {{IEEE} Computer Society},
  author = {Simon, Frank and Steinbr{\"u}ckner, Frank and Lewerentz, Claus},
  year = {2001},
  keywords = {cohesion, coupling, graph layout, graphs, metrics, refactoring, visualization},
  pages = {30},
  annote = {Discusses how some smells can be interpreted via metrics, primarily coupling and cohesion. The emphasis is on visualization. Relevance: 4 }
}

@misc{_open_2011,
  title = {The Open Biological and Biomedical Ontologies},
  url = {http://www.obofoundry.org/},
  year = {2011},
  note = {Accessed 2011-12-15},
  howpublished = {http://www.obofoundry.org/}
}

@article{brandes_variants_2008,
  title = {On variants of shortest-path betweenness centrality and their generic computation},
  volume = {30},
  issn = {03788733},
  url = {https://doi.org/10.1016/j.socnet.2007.11.001},
  doi = {10.1016/j.socnet.2007.11.001},
  number = {2},
  journal = {Social Networks},
  author = {Brandes, U},
  month = may,
  year = {2008},
  keywords = {betweenness, graphs, {SNA}},
  pages = {136--145}
}

@techreport{li_object-oriented_1993-1,
  address = {Blacksburg, {VA}, {USA}},
  type = {Technical Report},
  title = {{Object-Oriented} Metrics Which Predict Maintainability},
  abstract = {Software metrics have been studied in the procedural paradigm as a quantitative means of assessing the software development process as well as the quality of software products.
Several studies have validated that various metrics are useful indicators of maintenance effort in the procedural paradigm. However, software metrics have rarely been studied in the object oriented paradigm. Very few metrics have been proposed to measure object oriented systems, and the proposed ones have not been validated. This research concentrates on several object oriented software metrics and the validation of these metrics with maintenance effort in two commercial systems. Statistical analyses of a prediction model incorporating ten metrics are performed. In addition, a more compact model with fewer metrics was sought, analyses performed, and also presented.},
  number = {{TR} 93-05},
  institution = {Virginia Polytechnic Institute \& State University},
  author = {Li, Wei and Henry, Sallie M},
  year = {1993},
  pages = {35}
}

@mastersthesis{sager_coogle_2005,
  type = {Master's thesis},
  title = {Coogle - A {Code-Google} Plugin for Eclipse},
  url = {http://seal.ifi.uzh.ch/43/},
  abstract = {In recent years, various attempts have been made to "measure" if two pieces of software from different packages or from subsequent releases of the same software package are similar or not. One might imagine a tool that "magically" detects similar pieces of software stored in a huge software repository while the programmer implements new software. In that way, time is saved and software reusability is improved. Another application is software plagiarism detection where the principal task is to find (illegally) copied software. Determining software similarity also plays an important role in software evolution. When the task is to detect how software evolved over a certain period of time, software similarity is one source of information to get a better picture of the evolution process. Especially the approach of combining similarity measures and logical coupling measures seems to be very promising after all. {SimPack} is a library of similarity measures.
It implements a set of similarity measures along with a set of data wrappers. The wrappers are used to apply the measures to a specific data format. The goal of this diploma thesis is to implement the {Coogle-System} {(Code-Google)} which serves as search engine for software source code. Coogle makes use of a large software repository and a bunch of similarity measures implemented in {SimPack} to perform similarity measurements.},
  school = {University of Zurich},
  author = {Sager, Tobias},
  month = dec,
  year = {2005},
  keywords = {distance metric, Eclipse, plagiarism, similarity}
}
@inproceedings{agrawal_fast_1994,
  address = {San Francisco, {CA}, {USA}},
  series = {{VLDB} '94},
  title = {Fast Algorithms for Mining Association Rules in Large Databases},
  isbn = {1-55860-153-8},
  url = {http://portal.acm.org/citation.cfm?id=645920.672836},
  booktitle = {Proceedings of the 20th International Conference on Very Large Data Bases},
  publisher = {Morgan Kaufmann Publishers Inc.},
  author = {Agrawal, Rakesh and Srikant, Ramakrishnan},
  year = {1994},
  note = {{ACM} {ID:} 672836},
  pages = {487--499}
}
@article{jiawei_han_mining_1999,
  title = {Mining multiple-level association rules in large databases},
  volume = {11},
  issn = {1041-4347},
  doi = {10.1109/69.806937},
  abstract = {A top-down progressive deepening method is developed for efficient mining of multiple-level association rules from large transaction databases based on the a priori principle. A group of variant algorithms is proposed based on the ways of sharing intermediate results, with the relative performance tested and analyzed. The enforcement of different interestingness measurements to find more interesting rules, and the relaxation of rule conditions for finding {\textquotedblleft}level-crossing{\textquotedblright} association rules, are also investigated. The study shows that efficient algorithms can be developed from large databases for the discovery of interesting and strong multiple-level association rules.},
  number = {5},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  author = {Han, Jiawei and Fu, Yongjian},
  year = {1999},
  keywords = {data mining, interestingness measurements, large databases, transaction processing, very large databases},
  pages = {798--805}
}
@book{martin_clean_2008,
  title = {Clean Code: A Handbook of Agile Software Craftsmanship},
  isbn = {0132350882},
  shorttitle = {Clean Code},
  abstract = {Even bad code can function. But if code isn{\textquoteright}t clean, it can bring a development organization to its knees. Every year, countless hours and significant resources are lost because of poorly written code. But it doesn{\textquoteright}t have to be that way. Noted software expert Robert C. Martin presents a revolutionary paradigm with Clean Code: A Handbook of Agile Software Craftsmanship. Martin has teamed up with his colleagues from Object Mentor to distill their best agile practice of cleaning code {\textquotedblleft}on the fly{\textquotedblright} into a book that will instill within you the values of a software craftsman and make you a better programmer{\textemdash}but only if you work at it. What kind of work will you be doing? You{\textquoteright}ll be reading code{\textemdash}lots of code. And you will be challenged to think about what{\textquoteright}s right about that code, and what{\textquoteright}s wrong with it. More importantly, you will be challenged to reassess your professional values and your commitment to your craft. Clean Code is divided into three parts. The first describes the principles, patterns, and practices of writing clean code. The second part consists of several case studies of increasing complexity. Each case study is an exercise in cleaning up code{\textemdash}of transforming a code base that has some problems into one that is sound and efficient. The third part is the payoff: a single chapter containing a list of heuristics and {\textquotedblleft}smells{\textquotedblright} gathered while creating the case studies. The result is a knowledge base that describes the way we think when we write, read, and clean code. Readers will come away from this book understanding: how to tell the difference between good and bad code; how to write good code and how to transform bad code into good code; how to create good names, good functions, good objects, and good classes; how to format code for maximum readability; how to implement complete error handling without obscuring code logic; and how to unit test and practice test-driven development. This book is a must for any developer, software engineer, project manager, team lead, or systems analyst with an interest in producing better code.},
  publisher = {Prentice Hall {PTR}},
  author = {Martin, Robert C.},
  year = {2008},
  keywords = {agile, design, maintainability, refactoring, smells}
}
@inproceedings{poshyvanyk_conceptual_2006,
  title = {The conceptual coupling metrics for object-oriented systems},
  abstract = {Coupling in software has been linked with maintainability and existing metrics are used as predictors of external software quality attributes such as fault-proneness, impact analysis, ripple effects of changes, changeability, etc. Many coupling measures for object-oriented {(OO)} software have been proposed, each of them capturing specific dimensions of coupling. This paper presents a new set of coupling measures for {OO} systems {\textendash} named conceptual coupling, based on the semantic information obtained from the source code, encoded in identifiers and comments. A case study on open source software systems is performed to compare the new measures with existing structural coupling measures. The case study shows that the conceptual coupling captures new dimensions of coupling, which are not captured by existing coupling measures; hence it can be used to complement the existing metrics.},
  booktitle = {22nd {IEEE} International Conference on Software Maintenance, 2006.
{ICSM'06}},
  author = {Poshyvanyk, Denys and Marcus, Andrian},
  year = {2006},
  keywords = {cohesion, coupling, metrics},
  pages = {469--478},
  annote = {Includes a brief enumeration of various kinds of coupling measurements - structural coupling metrics, dynamic coupling measures [4], evolutionary and logical coupling [16, 38], coupling measures based on information entropy approach, ...}
}
@article{hall_weka_2009,
  title = {The {WEKA} data mining software: An update},
  volume = {11},
  number = {1},
  journal = {{ACM} {SIGKDD} Explorations Newsletter},
  author = {Hall, Mark and Frank, Eibe and Holmes, Geoffrey and Pfahringer, Bernhard and Reutemann, Peter and Witten, Ian H.},
  year = {2009},
  keywords = {data mining, machine learning},
  pages = {10--18}
}
@mastersthesis{gupta_critique_1997,
  type = {Master's thesis},
  title = {A critique of cohesion measures in the object-oriented paradigm},
  school = {Michigan Technological University},
  author = {Gupta, B. S.},
  year = {1997},
  keywords = {cohesion, metrics}
}
@incollection{mens_use_2006,
  title = {On the Use of Graph Transformations for Model Refactoring},
  url = {http://dx.doi.org/10.1007/11877028_7},
  doi = {10.1007/11877028_7},
  abstract = {Model-driven software engineering promotes the use of models and transformations as primary artifacts. Several formalisms can be used for the specification of model transformations. We propose to represent models as graphs, and model transformations as graph transformations. In particular, we focus on the activity of model refactoring, and show how graph transformation theory can provide formal support for this activity. We also show how such support can be implemented in state-of-the-art graph transformation tools such as {AGG} and Fujaba, and provide two concrete experiments. Critical pair analysis in {AGG} enables the analysis of dependencies between model refactorings. The round-trip engineering facility of Fujaba enables the automatic generation of code for model refactorings.},
  booktitle = {Generative and Transformational Techniques in Software Engineering},
  publisher = {Springer},
  author = {Mens, Tom},
  year = {2006},
  keywords = {graphs, semantic networks, transformation},
  pages = {219--257}
}
@book{booch_object-oriented_2007,
  title = {{Object-Oriented} Analysis and Design with Applications},
  edition = {Third},
  isbn = {9780132797443},
  abstract = {This is the {eBook} version of the printed book. {Object-Oriented} Design with Applications has long been the essential reference to object-oriented technology, which, in turn, has evolved to join the mainstream of industrial-strength software development. In this third edition--the first revision in 13 years--readers can learn to apply object-oriented methods using new paradigms such as Java, the Unified Modeling Language {(UML)} 2.0, and {.NET}. The authors draw upon their rich and varied experience to offer improved methods for object development and numerous examples that tackle the complex problems faced by software engineers, including systems architecture, data acquisition, cryptoanalysis, control systems, and Web development. They illustrate essential concepts, explain the method, and show successful applications in a variety of fields.
You'll also find pragmatic advice on a host of issues, including classification, implementation strategies, and cost-effective project management. New to this new edition are: An introduction to the new {UML} 2.0, from the notation's most fundamental and advanced elements with an emphasis on key changes. New domains and contexts. A greatly enhanced focus on modeling--as eagerly requested by readers--with five chapters that each delve into one phase of the overall development lifecycle. Fresh approaches to reasoning about complex systems. An examination of the conceptual foundation of the widely misunderstood fundamental elements of the object model, such as abstraction, encapsulation, modularity, and hierarchy. How to allocate the resources of a team of developers and manage the risks associated with developing complex software systems. An appendix on object-oriented programming languages. This is the seminal text for anyone who wishes to use object-oriented technology to manage the complexity inherent in many kinds of systems.},
  publisher = {{Addison-Wesley}},
  author = {Booch, Grady and Maksimchuk, Robert A. and Engel, Michael W. and Conallen, Jim and Houston, Kelli A. and Young, Bobbi J.},
  month = apr,
  year = {2007}
}
@inproceedings{vaucher_tracking_2009,
  address = {Lille, France},
  title = {Tracking Design Smells: Lessons from a Study of God Classes},
  shorttitle = {Tracking Design Smells},
  url = {http://www.computer.org/portal/web/csdl/doi/10.1109/WCRE.2009.23},
  doi = {10.1109/WCRE.2009.23},
  abstract = {{\textquotedblleft}God class{\textquotedblright} is a term used to describe a certain type of large classes which {\textquotedblleft}know too much or do too much{\textquotedblright}. Often a God class {(GC)} is created by accident as functionalities are incrementally added to a central class over the course of its evolution. {GCs} are generally thought to be examples of bad code that should be detected and removed to ensure software quality. However, in some cases, a {GC} is created by design as the best solution to a particular problem because, for example, the problem is not easily decomposable or strong requirements on efficiency exist. In this paper, we study in two open-source systems the {\textquotedblleft}life cycle{\textquotedblright} of {GCs:} how they arise, how prevalent they are, and whether they remain or they are removed as the systems evolve over time, through a number of versions. We show how to detect the degree of {\textquotedblleft}godliness{\textquotedblright} of classes automatically. Then, we show that by identifying the evolution of {\textquotedblleft}godliness{\textquotedblright}, we can distinguish between those classes that are so by design (good code) from those that occurred by accident (bad code). This methodology can guide software quality teams in their efforts to implement prevention and correction mechanisms.},
  booktitle = {2009 16th Working Conference on Reverse Engineering},
  author = {Vaucher, St{\'e}phane and Khomh, Foutse and Moha, Naouel and Gu{\'e}h{\'e}neuc, Yann-Ga{\"e}l},
  month = oct,
  year = {2009},
  keywords = {size, smells},
  pages = {145--154}
}
@techreport{eder_coupling_1994,
  address = {Klagenfurt, Austria},
  type = {Technical Report},
  title = {Coupling and cohesion in object-oriented systems},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.55.5819},
  institution = {Institut f{\"u}r Informatik, Universit{\"a}t Klagenfurt},
  author = {Eder, Johann and Kappel, Gerti and Schrefl, Michael},
  year = {1994},
  keywords = {cohesion, coupling, metrics},
  annote = {One of the earliest papers on {OO} coupling and cohesion. Classifies seven different types of cohesion - temporal, procedural, etc. Relevance: 3}
}
@book{freeman_head_2004,
  title = {{Head First} Design Patterns},
  isbn = {9780596007126},
  publisher = {{O'Reilly} Media, Inc.},
  author = {Freeman, Eric and Freeman, Elisabeth and Sierra, Kathy and Bates, Bert},
  year = {2004},
  keywords = {design patterns},
  annote = {Mentions "one class, one responsibility"}
}
@article{zhou_comparative_2004,
  title = {A comparative study of graph theory-based class cohesion measures},
  volume = {29},
  url = {http://portal.acm.org/citation.cfm?id=979767},
  doi = {10.1145/979743.979767},
  abstract = {Among a large number of cohesion measures for classes proposed in last decade, many measures abstract a class by an undirected or directed graph, in which the nodes represent the class members and the edges represent the relationships among these members. This paper compares six typical graph theory-based cohesion measures for classes, and states what problems should be addressed during the development of new cohesion measures.},
  number = {2},
  journal = {{ACM} {SIGSOFT} Software Engineering Notes},
  author = {Zhou, Yuming and Lu, Jiangtao and Lu, Hongmin and Xu, Baowen},
  year = {2004},
  keywords = {cohesion, graphs, metrics},
  pages = {13},
  annote = {Somewhat difficult to read, but it gives a good discussion of graph-based cohesion measures.
} }
@phdthesis{noack_unified_2007,
  address = {Cottbus, Germany},
  type = {{PhD} thesis},
  title = {Unified quality measures for clusterings, layouts, and orderings of graphs, and their application as software design criteria},
  abstract = {How good is a given graph clustering, graph layout, or graph ordering {\textendash} specifically, how well does it group densely connected vertices and separate sparsely connected vertices? How good is a given software design {\textendash} specifically, how well does it minimize the interdependence of the subsystems? This work introduces and validates simple and uniform measures for these two properties. Together with existing optimization algorithms, the introduced measures enable the automatic computation e.g. of communities in social networks and of design flaws in software systems. The first part derives, validates, and unifies quality measures for graph clusterings, graph layouts, and graph orderings, with the following results: {\textbullet} Identical quality measures can be applied to clusterings, layouts, and orderings; this enables the computation of consistent clusterings, layouts, and orderings. {\textbullet} Diverse existing and new measures can be unified into few general measures; this facilitates their comparison and validation. {\textbullet} Many existing measures are biased towards certain clusterings, layouts, or orderings, even for graphs without particularly dense or sparse subgraphs, and thus do not (only) measure quality in the above sense. {\textbullet} For example graphs, the minimization of new, unbiased (or weakly biased) measures reveals nonobvious groups, e.g. communities in social networks, subject areas in hypertexts, or closely interlocked countries in international trade. The second part derives, validates, and unifies dependency-based indicators of software design quality. It applies two quality measures for graph clusterings as measures for the coupling of software subsystems {\textendash} specifically for the coupling indicated by common changes and for the coupling indicated by references {\textendash} and shows: {\textbullet} The measures quantify the dependency-caused development costs, under well-defined simplifying assumptions. {\textbullet} The minimization of the measures conforms to existing dependency-related design principles (like locality of change, acyclicity of references, and stability of references), design rules, and design patterns. {\textbullet} For example software systems, the incremental minimization of the measures reveals nonobvious design flaws, like the distribution of coherent responsibilities over several subsystems, or references from low-level to high-level subsystems. In summary, this work shows that {\textbullet} simple measures can suffice to capture important aspects of graph clustering quality, graph layout quality, graph ordering quality, and software design quality, and {\textbullet} the optimization of simple measures can suffice to detect nonobvious and often useful structure in various real-world systems.},
  school = {Brandenburg University of Technology},
  author = {Noack, Andreas},
  year = {2007},
  keywords = {graph layout, graphs, metrics, thesis}
}
@article{coleman_using_1994,
  title = {Using metrics to evaluate software system maintainability},
  volume = {27},
  issn = {0018-9162},
  doi = {10.1109/2.303623},
  abstract = {Software metrics have been much criticized in the last few years, sometimes justly but more often unjustly, because critics misunderstand the intent behind the technology. Software complexity metrics, for example, rarely measure the "inherent complexity" embedded in software systems, but they do a very good job of comparing the relative complexity of one portion of a system with another. In essence, they are good modeling tools.
Whether they are also good measuring tools depends on how consistently and appropriately they are applied.},
  number = {8},
  journal = {Computer},
  author = {Coleman, Don and Ash, Dan and Lowther, Bruce and Oman, Paul},
  year = {1994},
  keywords = {complexity, maintainability, maintenance, metrics},
  pages = {44--49},
  annote = {This paper contains some numbers pertaining to maintenance costs and talks about maintainability indices, particularly those built using polynomial combinations of other metrics.}
}
@inproceedings{jarzabek_genericity_2006,
  address = {Washington, {DC}, {USA}},
  title = {Genericity - a ``{Missing} in {Action}'' Key to Software Simplification and Reuse},
  isbn = {0-7695-2685-3},
  doi = {10.1109/APSEC.2006.37},
  booktitle = {{APSEC} '06: Proceedings of the {XIII} Asia Pacific Software Engineering Conference},
  publisher = {{IEEE} Computer Society},
  author = {Jarzabek, Stan},
  year = {2006},
  keywords = {clones, refactoring},
  pages = {293--300},
  annote = {Gives some good background on similarity, how it can be removed, but also some on why it can't or shouldn't be removed. He then discusses his "generative meta-programming technique." Relevance: 3 }
}
@inproceedings{kovacs_cluster_2005,
  title = {Cluster validity measurement techniques},
  booktitle = {6th International Symposium of Hungarian Researchers on Computational Intelligence},
  author = {Kov{\'a}cs, F. and Leg{\'a}ny, C. and Babos, A.},
  year = {2005},
  keywords = {clustering, validation}
}
@article{czibula_hierarchical_2008,
  title = {Hierarchical Clustering Based Design Patterns Identification},
  volume = {{III}},
  issn = {1841-9836},
  abstract = {Design patterns have attracted significant attention in software engineering in the last period. An important reason behind this is that design patterns are potentially useful in both development of new, and comprehension of existing object-oriented design, especially for large legacy systems without sufficient documentation. That is why the problem of design patterns identification is very important. Automating the detection of design pattern instances could be of significant help to the process of reverse engineering large software systems. In this paper we aim at introducing a search based approach for identifying instances of design patterns in a software system. An experimental evaluation of our approach is also provided.},
  number = {Suppl. issue: Proceedings of {ICCCC} 2008},
  journal = {International Journal of Computers, Communication and Control},
  author = {Czibula, Istv{\'a}n-Gergely and Serban, Gabriela},
  year = {2008},
  keywords = {clustering, patterns, software clustering},
  pages = {248--252}
}
@article{al_dallal_object-oriented_2010,
  title = {An object-oriented high-level design-based class cohesion metric},
  volume = {52},
  issn = {0950-5849},
  url = {http://www.sciencedirect.com/science/article/pii/S0950584910001552},
  doi = {10.1016/j.infsof.2010.08.006},
  abstract = {Context Class cohesion is an important object-oriented software quality attribute. Assessing class cohesion during the object-oriented design phase is one important way to obtain more comprehensible and maintainable software. In practice, assessing and controlling cohesion in large systems implies measuring it automatically. One issue with the few existing cohesion metrics targeted at the high-level design phase is that they are not based on realistic assumptions and do not fulfill expected mathematical properties. Objective This paper proposes a {High-Level} Design {(HLD)} class cohesion metric, which is based on realistic assumptions, complies with expected mathematical properties, and can be used to automatically assess design quality at early stages using {UML} diagrams. Method The notion of similarity between pairs of methods and pairs of attribute types in a class is introduced and used as a basis to introduce a novel high-level design class cohesion metric.
The metric considers method{\textendash}method, attribute{\textendash}attribute, and attribute{\textendash}method direct and transitive interactions. We validate this Similarity-based Class Cohesion {(SCC)} metric theoretically and empirically. The former includes a careful study of the mathematical properties of the metric whereas the latter investigates, using four open source software systems and 10 cohesion metrics, whether {SCC} is based on realistic assumptions and whether it better explains the presence of faults, from a statistical standpoint, than other comparable cohesion metrics, considered individually or in combination. Results Results confirm that {SCC} is based on clearly justified theoretical principles, relies on realistic assumptions, and is an early indicator of quality (fault occurrences). Conclusion It is concluded that {SCC} is both theoretically valid and supported by empirical evidence. It is a better alternative to measure class cohesion than existing {HLD} class cohesion metrics.},
  number = {12},
  journal = {Information and Software Technology},
  author = {Al Dallal, Jehad and Briand, Lionel C.},
  month = dec,
  year = {2010},
  keywords = {cohesion, Fault prediction, {OOD}},
  pages = {1346--1361}
}
@inproceedings{telea_extraction_2009,
  title = {Extraction and Visualization of Call Dependencies for Large {C/C++} Code Bases: A Comparative Study},
  abstract = {Investigating program dependencies such as function calls is challenging for very large systems. We present here an integrated pipeline for extraction and visualization of call-and-hierarchy graphs for {C/C++} programs. We present several adaptions and enhancements of a recent visualization method for large call graphs and compare its effectiveness with classical node-link diagrams. Examples are given on large real-world code bases such as bison, Mozilla and oink.},
  booktitle = {Proc. {VISSOFT}},
  author = {Telea, A. and Hoogendorp, H. and Ersoy, O. and Reniers, D.},
  year = {2009},
  keywords = {call graph, graphs, visualization}
}
@inproceedings{kirk_using_2005,
  title = {Using Attribute Slicing to Refactor Large Classes},
  booktitle = {Beyond Program Slicing},
  author = {Kirk, Douglas and Roper, Marc and Walkinshaw, Neil},
  year = {2005},
  keywords = {refactoring, restructuring, slicing}
}
@article{kleinberg_authoritative_1999,
  title = {Authoritative sources in a hyperlinked environment},
  volume = {46},
  url = {http://portal.acm.org/citation.cfm?id=324140},
  doi = {10.1145/324133.324140},
  abstract = {The network structure of a hyperlinked environment can be a rich source of information about the content of the environment, provided we have effective means for understanding it. We develop a set of algorithmic tools for extracting information from the link structures of such environments, and report on experiments that demonstrate their effectiveness in a variety of contexts on the World Wide Web. The central issue we address within our framework is the distillation of broad search topics, through the discovery of {\textquotedblleft}authoritative{\textquotedblright} information sources on such topics. We propose and test an algorithmic formulation of the notion of authority, based on the relationship between a set of relevant authoritative pages and the set of {\textquotedblleft}hub pages{\textquotedblright} that join them together in the link structure. Our formulation has connections to the eigenvectors of certain matrices associated with the link graph; these connections in turn motivate additional heuristics for link-based analysis.},
  number = {5},
  journal = {Journal of the {ACM}},
  author = {Kleinberg, Jon M.},
  year = {1999},
  keywords = {graphs, link analysis, world wide web},
  pages = {604--632}
}
@article{hanna_maintenance_1993,
  title = {Maintenance burden begging for remedy},
  volume = {13},
  number = {6},
  journal = {Software Magazine},
  author = {Hanna, Mary},
  month = apr,
  year = {1993},
  keywords = {maintenance},
  pages = {53},
  annote = {Reports maintenance costs from 43-44\% for 1992-1993. Relevance: 2}
}
@incollection{washizaki_automated_2003,
  title = {Automated Extract Component Refactoring},
  url = {http://dx.doi.org/10.1007/3-540-44870-5_42},
  doi = {10.1007/3-540-44870-5_42},
  abstract = {We propose a new refactoring {\textquotedblleft}Extract Component{\textquotedblright} to support the organizational reuse of components and improve the productivity under Agile methods. Our refactoring can extract reusable components composed of classes from object-oriented programs, and modify the surrounding parts of extracted components in original programs. We have developed a tool that performs our refactoring automatically.},
  booktitle = {Extreme Programming and Agile Processes in Software Engineering},
  publisher = {Springer},
  author = {Washizaki, Hironori and Fukazawa, Yoshiaki},
  year = {2003},
  keywords = {component architectures, refactoring},
  pages = {1016}
}
@inproceedings{rousidis_clustering_2005,
  title = {Clustering Data Retrieved from Java Source Code to Support Software Maintenance: A Case Study},
  isbn = {0-7695-2304-8},
  shorttitle = {Clustering Data Retrieved from Java Source Code to Support Software Maintenance},
  url = {http://portal.acm.org/citation.cfm?id=1049037},
  abstract = {Data mining is a technology recently used in support of software maintenance in various contexts. Our work focuses on achieving a high level understanding of Java systems without prior familiarity with these. Our thesis is that system structure and interrelationships, as well as similarities among program components can be derived by applying cluster analysis on data extracted from source code.
This paper proposes a methodology suitable for Java code analysis. It comprises of a Java code analyser which examines programs and constructs tables representing code syntax, and a clustering engine which operates on such tables and identifies relationships among code elements. We evaluate the methodology on a medium sized system, present initial results and discuss directions for further work.}, booktitle = {Proceedings of the Ninth European Conference on Software Maintenance and Reengineering}, publisher = {{IEEE} Computer Society}, author = {Rousidis, Dimitris and Tjortjis, Christos}, year = {2005}, keywords = {clustering, reverse engineering}, pages = {276--279}, annote = {Lightweight. Groupings of 15 methods are shown to correspond with groupings created by people. They also show the schema they based their clustering on. Relevance: 3} }, @inproceedings{pich_visual_2008, address = {Ammersee, Germany}, title = {Visual analysis of importance and grouping in software dependency graphs}, isbn = {978-1-60558-112-5}, url = {http://portal.acm.org/citation.cfm?id=1409720.1409725&coll=portal&dl=ACM&type=series&idx=SERIES10774&part=series&WantType=Proceedings&title=SV&CFID=86337823&CFTOKEN=19567468}, doi = {10.1145/1409720.1409725}, abstract = {Understanding dependencies between components is a key task in software engineering. We present a method for the display and visual analysis of dependency graphs occurring in large software systems. Our layout approach takes into account similarity and importance of the system components and additional grouping information; using efficient algorithms based on linear algebra, it scales to very large dependency graphs. 
We apply our method to two real-world software systems and present the results.}, booktitle = {Proceedings of the 4th {ACM} symposium on Software visualization}, publisher = {{ACM}}, author = {Pich, Christian and Nachmanson, Lev and Robertson, George G.}, year = {2008}, keywords = {analysis, clustering, graph layout, graphs, multidimensional scaling, visualization}, pages = {29--32} }, @inproceedings{bavota_two-step_2010, address = {Antwerp, Belgium}, title = {A two-step technique for extract class refactoring}, isbn = {978-1-4503-0116-9}, url = {http://portal.acm.org/citation.cfm?id=1858996.1859024}, doi = {10.1145/1858996.1859024}, abstract = {We propose a novel approach supporting the Extract Class refactoring. The proposed approach analyzes the (structural and semantic) similarity of the methods in a class in order to identify chains of strongly related methods. The identified method chains are used to define new classes with higher cohesion than the original class. A preliminary evaluation reveals that the approach is able to identify meaningful refactoring operations.}, booktitle = {Proceedings of the {IEEE/ACM} International Conference on Automated Software Engineering}, publisher = {{ACM}}, author = {Bavota, Gabriele and De Lucia, Andrea and Marcus, Andrian and Oliveto, Rocco}, year = {2010}, keywords = {cohesion, coupling, empirical, extract class, semantics}, pages = {151--154}, annote = {The cohesion/similarity measure combines weighted structural and semantic terms. One of the structural terms is a Jaccard measurement relating methods to attributes. Another relates to the number of calls between methods. They state that when a method has only a single incoming call that those two methods should not be split. I disagree strongly. They mention that another study found that the structural and semantic cohesion metrics did not correlate. This seems a bad thing. I have my doubts about this result. I'll look closer. 
The algorithm works in two steps - (1) a weighted graph is built which is then split into pieces based on links with low cohesion (similarity) weights, (2) the little chains are joined with the longer ones. Joining of the chains is done using the same cohesion metric, only the distances are computed using the average of the pairwise distances between the chains. I think it would be better to have two distinctly different modes of operating. Structural relationships have primacy, and semantics are secondary. Relevance: 5 }
},
@article{tarjan_depth-first_1972,
  title = {{Depth-First} Search and Linear Graph Algorithms},
  volume = {1},
  issn = {00975397},
  url = {http://link.aip.org/link/SMJCAT/v1/i2/p146/s1&Agg=doi},
  doi = {10.1137/0201010},
  number = {2},
  journal = {{SIAM} Journal on Computing},
  author = {Tarjan, Robert},
  year = {1972},
  keywords = {clustering, graph algorithms, graphs},
  pages = {146--160}
},
@article{poshyvanyk_using_2009,
  title = {Using information retrieval based coupling measures for impact analysis},
  volume = {14},
  issn = {1382-3256},
  url = {http://www.springerlink.com/content/36552251767q5646/},
  doi = {10.1007/s10664-008-9088-2},
  number = {1},
  journal = {Empirical Software Engineering},
  author = {Poshyvanyk, Denys and Marcus, Andrian and Ferenc, Rudolf and Gyim{\'o}thy, Tibor},
  month = feb,
  year = {2009},
  keywords = {coupling, information retrieval, metrics},
  pages = {5--32}
},
@article{huson_dendroscope:_2007,
  title = {Dendroscope: An interactive viewer for large phylogenetic trees},
  volume = {8},
  issn = {1471-2105},
  shorttitle = {Dendroscope},
  url = {http://www.biomedcentral.com/1471-2105/8/460},
  doi = {10.1186/1471-2105-8-460},
  abstract = {{BACKGROUND}: Research in evolution requires software for visualizing and editing phylogenetic trees, for increasingly very large datasets, such as arise in expression analysis or metagenomics, for example.
It would be desirable to have a program that provides these services in an efficient and user-friendly way, and that can be easily installed and run on all major operating systems. Although a large number of tree visualization tools are freely available, some as a part of more comprehensive analysis packages, all have drawbacks in one or more domains. They either lack some of the standard tree visualization techniques or basic graphics and editing features, or they are restricted to small trees containing only tens of thousands of taxa. Moreover, many programs are difficult to install or are not available for all common operating systems. {RESULTS}: We have developed a new program, Dendroscope, for the interactive visualization and navigation of phylogenetic trees. The program provides all standard tree visualizations and is optimized to run interactively on trees containing hundreds of thousands of taxa. The program provides tree editing and graphics export capabilities. To support the inspection of large trees, Dendroscope offers a magnification tool. The software is written in Java 1.4 and installers are provided for {Linux/Unix}, {MacOS} X and Windows {XP}. {CONCLUSION}: Dendroscope is a user-friendly program for visualizing and navigating phylogenetic trees, for both small and large datasets.},
  number = {1},
  journal = {{BMC} Bioinformatics},
  author = {Huson, Daniel and Richter, Daniel and Rausch, Christian and Dezulian, Tobias and Franz, Markus and Rupp, Regula},
  year = {2007},
  pages = {460}
},
@book{lanza_object-oriented_2006,
  title = {{Object-Oriented} Metrics in Practice},
  isbn = {3540244298},
  publisher = {{Springer-Verlag} New York, Inc.},
  author = {Lanza, Michele and Marinescu, Radu},
  year = {2006},
  keywords = {empirical, metrics, smells},
  annote = {Contains some useful info on statistical metric values. It also talks about detection of problems (e.g. god classes) using combinations of metric values.
Available on Google Books at {http://books.google.co.nz/books?hl=en\&lr=\&id=gdLbgnaMaa0C\&oi=fnd\&pg=PA1\&ots=sZvMxRIorR\&sig=xxwjQ4nMXJphZGpNT2wetMFFTDI\#v=onepage\&q\&f=false} Statistics-based thresholds (pg 14): {NOM}, {LOC}, {CYCLO} Low = avg - stdev, high = avg + stdev; very high = (avg + stdev)*1.5 For Java: {(Low}, Ave., High, Very High) {CYCLO/LOC:} 0.16, 0.2, 0.24, 0.36 {LOC/Method:} 7, 10, 13, 19.5 {NOM/Class:} 4, 7, 10, 15 {WMC:} 5, 14, 31, 47 {AMW:} 1.1, 2.0, 3.1, 4.7 {LOC/Class:} 28, 70, 130, 195 {CYCLO} {(McCabe)} Their definition (== {CKs94} defn) of {WMC} {(Weighted} Method Count): "the sum of the {CYCLO} metric over all methods of a class." {AMW} - The average static complexity of all methods in a class. {McCabe{\textquoteright}s} cyclomatic number is used to quantify the method{\textquoteright}s complexity {WMC} = {CYCLO/LOC} * {LOC/method} * {NOM/class} The god class detection strategy (pg 80): Class uses directly more than a few attributes of other classes {(ATFD} {\textgreater} few) and functional complexity of the class is very high {(WMC} {\textgreater}= very high) and class cohesion is low {(TCC} {\textless} .33). In discussing the break-up of god classes - "a first approach is to identify clusters of methods and attributes and to extract these islands into separate classes." Also defines a Brain Class disharmony. The main thing here is the presence of a Brain Method, which needs to be dealt with. Because we are not breaking up methods, we'll ignore this. }
},
@article{van_mechelen_two-mode_2004,
  title = {Two-mode clustering methods: a structured overview},
  volume = {13},
  shorttitle = {Two-mode clustering methods},
  url = {http://smm.sagepub.com/content/13/5/363.abstract},
  doi = {10.1191/0962280204sm373ra},
  abstract = {In this paper we present a structured overview of methods for two-mode clustering, that is, methods that provide a simultaneous clustering of the rows and columns of a rectangular data matrix.
Key structuring principles include the nature of row, column and data clusters and the type of model structure or associated loss function. We illustrate with analyses of symptom data on archetypal psychiatric patients.},
  number = {5},
  journal = {Statistical Methods in Medical Research},
  author = {Van Mechelen, Iven and Bock, {Hans-Hermann} and De Boeck, Paul},
  month = oct,
  year = {2004},
  pages = {363--394}
},
@article{cilibrasi_google_2007,
  title = {The Google similarity distance},
  volume = {19},
  url = {http://arxiv.org/abs/cs/0412098},
  abstract = {Words and phrases acquire meaning from the way they are used in society, from their relative semantics to other words and phrases. For computers the equivalent of `society' is `database,' and the equivalent of `use' is `way to search the database.' We present a new theory of similarity between words and phrases based on information distance and Kolmogorov complexity. To fix thoughts we use the world-wide-web as database, and Google as search engine. The method is also applicable to other search engines and databases. This theory is then applied to construct a method to automatically extract similarity, the Google similarity distance, of words and phrases from the world-wide-web using Google page counts. The world-wide-web is the largest database on earth, and the context information entered by millions of independent users averages out to provide automatic semantics of useful quality. We give applications in hierarchical clustering, classification, and language translation. We give examples to distinguish between colors and numbers, cluster names of paintings by 17th century Dutch masters and names of books by English novelists, the ability to understand emergencies, and primes, and we demonstrate the ability to do a simple automatic {English-Spanish} translation. Finally, we use the {WordNet} database as an objective baseline against which to judge the performance of our method.
We conduct a massive randomized trial in binary classification using support vector machines to learn categories based on our Google distance, resulting in a mean agreement of 87\% with the expert crafted {WordNet} categories.},
  number = {3},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  author = {Cilibrasi, Rudi and Vit{\'a}nyi, Paul M. B.},
  month = mar,
  year = {2007},
  keywords = {clustering, search, similarity},
  pages = {370--383}
},
@article{binkley_empirical_2008,
  title = {An empirical study of the relationship between the concepts expressed in source code and dependence},
  volume = {81},
  issn = {01641212},
  url = {http://linkinghub.elsevier.com/retrieve/pii/S0164121208000824},
  doi = {10.1016/j.jss.2008.04.007},
  abstract = {Programs express domain-level concepts in their source code. It might be expected that such concepts would have a degree of semantic cohesion. This cohesion ought to manifest itself in the dependence between statements all of which contribute to the computation of the same concept. This paper addresses a set of research questions that capture this informal observation. It presents the results of experiments on 10 programs that explore the relationship between domain-level concepts and dependence in source code. The results show that code associated with concepts has a greater degree of coherence, with tighter dependence.
This finding has positive implications for the analysis of concepts as it provides an approach to decompose a program into smaller executable units, each of which captures the behaviour of the program with respect to a domain-level concept.},
  number = {12},
  journal = {Journal of Systems and Software},
  author = {Binkley, D. and Gold, N. and Harman, M. and Li, Z. and Mahdavi, K.},
  month = dec,
  year = {2008},
  keywords = {cohesion, semantics},
  pages = {2287--2298}
},
@book{witten_data_2011,
  title = {Data Mining: Practical Machine Learning Tools and Techniques},
  isbn = {9780123748560},
  shorttitle = {Data Mining},
  edition = {Third},
  publisher = {Elsevier Science \& Technology},
  author = {Witten, Ian H. and Frank, Eibe and Hall, Mark A.},
  month = jan,
  year = {2011}
},
@article{dong_review_2008,
  title = {A Review of Design Pattern Mining Techniques},
  abstract = {The quality of a software system highly depends on its architectural design. High quality software systems typically apply expert design experience which has been captured as design patterns. As demonstrated solutions to recurring problems, design patterns help to reuse expert experience in software system design. They have been extensively applied in industry. Mining the instances of design patterns from the source code of software systems can assist the understanding of the systems and the process of re-engineering them. More importantly, it also helps to trace back to the original design decisions, which are typically missing in legacy systems. This paper presents a review on current techniques and tools for mining design patterns from source code or design of software systems. We classify different approaches and analyze their results in a comparative study.
We also examine the disparity of the discovery results from different approaches and analyze possible reasons with some insight.},
  author = {Dong, Jing and Zhao, Yajing and Peng, Tu},
  year = {2008},
  keywords = {design pattern mining, design patterns, reengineering, reverse engineering, survey}
},
@inproceedings{snelting_reengineering_1998,
  address = {Lake Buena Vista, Florida, United States},
  title = {Reengineering class hierarchies using concept analysis},
  isbn = {1-58113-108-9},
  url = {http://portal.acm.org/citation.cfm?id=288273},
  doi = {10.1145/288195.288273},
  abstract = {The design of a class hierarchy may be imperfect. For example, a class C may contain a member m not accessed in any C-instance, an indication that m could be eliminated, or moved into a derived class. Furthermore, different subsets of C's members may be accessed from different C-instances, indicating that it might be appropriate to split C into multiple classes. We present a framework for detecting and remediating such design problems, which is based on concept analysis. Our method analyzes a class hierarchy along with a set of applications that use it, and constructs a lattice that provides valuable insights into the usage of the class hierarchy in a specific context.
We show how a restructured class hierarchy can be generated from the lattice, and how the lattice can serve as a formal basis for interactive tools for redesigning and restructuring class hierarchies.},
  booktitle = {Proceedings of the 6th {ACM} {SIGSOFT} International Symposium on Foundations of Software Engineering},
  publisher = {{ACM}},
  author = {Snelting, Gregor and Tip, Frank},
  year = {1998},
  keywords = {{FCA}, inheritance, reengineering, refactoring, restructuring},
  pages = {99--110}
},
@article{emam_optimal_2002,
  title = {The Optimal Class Size for {Object-Oriented} Software},
  volume = {28},
  url = {http://portal.acm.org/citation.cfm?id=567181},
  abstract = {A growing body of literature suggests that there is an optimal size for software components. This means that components that are too small or too big will have a higher defect content (i.e., there is a U-shaped curve relating defect content to size). The U-shaped curve has become known as the Goldilocks Conjecture. Recently, a cognitive theory has been proposed to explain this phenomenon and it has been expanded to characterize object-oriented software. This conjecture has wide implications for software engineering practice. It suggests 1) that designers should deliberately strive to design classes that are of the optimal size, 2) that program decomposition is harmful, and 3) that there exists a maximum (threshold) class size that should not be exceeded to ensure fewer faults in the software. The purpose of the current paper is to evaluate this conjecture for object-oriented systems. We first demonstrate that the claims of an optimal component/class size (1) above) and of smaller components/classes having a greater defect content (2) above) are due to a mathematical artifact in the analyses performed previously. We then empirically test the threshold effect claims of this conjecture (3) above).
To our knowledge, the empirical test of size threshold effects for object-oriented systems has not been performed thus far. We performed an initial study with an industrial C++ system and repeated it twice on another C++ system and on a commercial Java application. Our results provide unambiguous evidence that there is no threshold effect of class size. We obtained the same result for three systems using four different size measures. These findings suggest that there is a simple continuous relationship between class size and faults, and that, optimal class size, smaller classes are better and threshold effects conjectures have no sound theoretical nor empirical basis.},
  number = {5},
  journal = {{IEEE} Transactions on Software Engineering},
  author = {El Emam, Khaled and Benlarbi, Sa{\"\i}da and Goel, Nishith and Melo, Walcelio and Lounis, Hakim and Rai, Shesh N.},
  year = {2002},
  keywords = {empirical, metrics, size},
  pages = {494--509}
},
@inproceedings{han_mining_2000,
  address = {New York, {NY}, {USA}},
  title = {Mining frequent patterns without candidate generation},
  volume = {29},
  doi = {10.1145/335191.335372},
  abstract = {Mining frequent patterns in transaction databases, time-series databases, and many other kinds of databases has been studied popularly in data mining research. Most of the previous studies adopt an Apriori-like candidate set generation-and-test approach. However, candidate set generation is still costly, especially when there exist prolific patterns and/or long patterns. In this study, we propose a novel frequent pattern tree {(FP-tree)} structure, which is an extended prefix-tree structure for storing compressed, crucial information about frequent patterns, and develop an efficient {FP-tree-based} mining method, {FP-growth}, for mining the complete set of frequent patterns by pattern fragment growth.
Efficiency of mining is achieved with three techniques: (1) a large database is compressed into a highly condensed, much smaller data structure, which avoids costly, repeated database scans, (2) our {FP-tree-based} mining adopts a pattern fragment growth method to avoid the costly generation of a large number of candidate sets, and (3) a partitioning-based, divide-and-conquer method is used to decompose the mining task into a set of smaller tasks for mining confined patterns in conditional databases, which dramatically reduces the search space. Our performance study shows that the {FP-growth} method is efficient and scalable for mining both long and short frequent patterns, and is about an order of magnitude faster than the Apriori algorithm and also faster than some recently reported new frequent pattern mining methods.},
  booktitle = {{ACM} {SIGMOD} Record},
  publisher = {{ACM}},
  author = {Han, Jiawei and Pei, Jian and Yin, Yiwen},
  month = may,
  year = {2000},
  keywords = {data mining},
  pages = {1--12}
},
@phdthesis{marcus_semantic-driven_2003,
  title = {Semantic-driven program analysis},
  url = {http://portal.acm.org/citation.cfm?id=979268},
  abstract = {The tasks of maintenance and reengineering of an existing software system require a great deal of effort to be spent on understanding the source code to determine the behavior, organization, and architecture of the software. Different types of information (e.g., static, dynamic, source code, documentation, etc.) will describe different features of the software system.
There are at least two key aspects of the system that the user needs to understand: (1) what problem is the software solving and (2) how is the software achieving the solution.},
  school = {Kent State University},
  author = {Marcus, Andrian},
  year = {2003}
},
@book{laplante_antipatterns:_2005,
  title = {{AntiPatterns}: Identification, Refactoring, and Management},
  isbn = {9780849329944},
  shorttitle = {{AntiPatterns}},
  publisher = {{CRC} Press},
  author = {Laplante, Phillip A. and Neill, Colin J.},
  month = dec,
  year = {2005},
  keywords = {patterns, refactoring},
  annote = {Uses "refactor relentlessly"}
},
@article{fowlkes_method_1983,
  title = {A Method for Comparing Two Hierarchical Clusterings},
  volume = {78},
  issn = {0162-1459},
  url = {http://www.jstor.org/stable/2288117},
  doi = {10.2307/2288117},
  abstract = {This article concerns the derivation and use of a measure of similarity between two hierarchical clusterings. The measure, $B_k$, is derived from the matching matrix, $[m_{ij}]$, formed by cutting the two hierarchical trees and counting the number of matching entries in the $k$ clusters in each tree. The mean and variance of $B_k$ are determined under the assumption that the margins of $[m_{ij}]$ are fixed. Thus, $B_k$ represents a collection of measures for $k = 2, \ldots, n - 1$. $(k, B_k)$ plots are found to be useful in portraying the similarity of two clusterings. $B_k$ is compared to other measures of similarity proposed respectively by Baker (1974) and Rand (1971).
The use of $(k, B_k)$ plots for studying clustering methods is explored by a series of Monte Carlo sampling experiments. An example of the use of $(k, B_k)$ on real data is given.},
  number = {383},
  journal = {Journal of the American Statistical Association},
  author = {Fowlkes, E. B. and Mallows, C. L.},
  year = {1983},
  note = {{ArticleType:} research-article / Full publication date: Sep., 1983 / Copyright {\textcopyright} 1983 American Statistical Association},
  keywords = {clustering, similarity},
  pages = {553--569}
},
@techreport{berkhin_survey_2002,
  address = {San Jose, {CA}},
  type = {Technical Report},
  title = {Survey Of Clustering Data Mining Techniques},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3739},
  abstract = {Clustering is a division of data into groups of similar objects. Representing the data by fewer clusters necessarily loses certain fine details, but achieves simplification. It models data by its clusters. Data modeling puts clustering in a historical perspective rooted in mathematics, statistics and numerical analysis. From a machine learning perspective clusters correspond to hidden patterns, the search for clusters in unsupervised learning and the resulting system represents a data concept....},
  institution = {Accrue Software},
  author = {Berkhin, Pavel},
  year = {2002},
  note = {Accessed via http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3739 on 22 Nov 2011},
  keywords = {clustering, data mining, survey},
  pages = {56}
},
@inproceedings{jurgens_s-space_2010,
  address = {Uppsala, Sweden},
  title = {The {S-Space} Package: An Open Source Package for Word Space Models},
  abstract = {We present the {S-Space} Package, an open source framework for developing and evaluating word space algorithms.
The package implements well-known word space algorithms, such as {LSA}, and provides a comprehensive set of matrix utilities and data structures for extending new or existing models. The package also includes word space benchmarks for evaluation. Both algorithms and libraries are designed for high concurrency and scalability. We demonstrate the efficiency of the reference implementations and also provide their results on six benchmarks.},
  booktitle = {System Papers of the Association of Computational Linguistics},
  author = {Jurgens, David and Stevens, Keith},
  month = jul,
  year = {2010},
  keywords = {latent semantic indexing, semantics}
},
@incollection{moha_refactorings_2008,
  address = {Berlin / Heidelberg},
  series = {Lecture Notes in Computer Science},
  title = {Refactorings of Design Defects Using Relational Concept Analysis},
  volume = {4933},
  isbn = {978-3-540-78136-3},
  abstract = {Software engineers often need to identify and correct design defects, i.e., recurring design problems that hinder development and maintenance by making programs harder to comprehend and/or evolve. While detection of design defects is an actively researched area, their correction {\textemdash} mainly a manual and time-consuming activity {\textemdash} is yet to be extensively investigated for automation. In this paper, we propose an automated approach for suggesting defect-correcting refactorings using relational concept analysis (rca). The added value of rca consists in exploiting the links between formal objects which abound in a software re-engineering context.
We validated our approach on instances of the Blob design defect taken from four different open-source programs.},
  booktitle = {Formal Concept Analysis},
  publisher = {Springer},
  author = {Moha, Naouel and Hacene, Amine and Valtchev, Petko and Gu{\'e}h{\'e}neuc, Yann-Ga{\"e}l},
  year = {2008},
  keywords = {{FCA}, inheritance, refactoring, relational concept analysis},
  pages = {289--304},
  doi = {10.1007/978-3-540-78137-0_21},
  annote = {http://dx.doi.org/10.1007/978-3-540-78137-0\_21}
},
@incollection{kalinovsky_covert_2004,
  title = {Covert Java: Obfuscating Classes},
  copyright = {Sample Chapter is provided courtesy of Sams},
  shorttitle = {Covert Java},
  url = {http://www.informit.com/articles/article.aspx?p=174368},
  booktitle = {Covert Java: Techniques for Decompiling, Patching, and Reverse Engineering},
  publisher = {Sams},
  author = {Kalinovsky, Alex},
  month = jul,
  year = {2004},
  keywords = {obfuscation}
},
@article{aggarwal_investigating_2007,
  title = {Investigating effect of design metrics on fault proneness in object-oriented Systems},
  volume = {6},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.5689},
  abstract = {Demand for quality software has undergone with rapid growth during the last few years. This is leading to an increase in the development of metrics for measuring the properties of software such as coupling, cohesion or inheritance that can be used in early quality assessments. Quality models that explore the relationship between these properties and quality attributes such as fault proneness, maintainability, effort or productivity are needed to use these metrics effectively. The goal of this work is to empirically explore the relationship between object-oriented design metrics and fault proneness of object-oriented system classes. The study used data collected from Java applications is containing 136 classes. We use a set of twenty-six design metrics in our work. Result of this study shows that many metrics are based on comparable ideas and provide redundant information.
It is shown that by using a subset of metrics in the prediction models can be built to identify the faulty classes. The proposed model predicts faulty classes with more than 80\% accuracy.},
  number = {10},
  journal = {Journal of Object Technology},
  author = {Aggarwal, K. K. and Singh, Yogesh and Kaur, Arvinder and Malhotra, Ruchika},
  month = dec,
  year = {2007},
  keywords = {cohesion, coupling, empirical, inheritance, metrics},
  pages = {127--141}
},
@inproceedings{seng_search-based_2006,
  address = {Seattle, Washington, {USA}},
  title = {Search-based determination of refactorings for improving the class structure of object-oriented systems},
  isbn = {1-59593-186-4},
  doi = {10.1145/1143997.1144315},
  abstract = {A software system's structure degrades over time, a phenomenon that is known as software decay or design drift. Since the quality of the structure has major impact on the maintainability of a system, the structure has to be reconditioned from time to time. Even if recent advances in the fields of automated detection of bad smells and refactorings have made life easier for software engineers, this is still a very complex and resource consuming task. Search-based approaches have turned out to be helpful in aiding a software engineer to improve the subsystem structure of a software system. In this paper we show that such techniques are also applicable when reconditioning the class structure of a system. We describe a novel search-based approach that assists a software engineer who has to perform this task by suggesting a list of refactorings. Our approach uses an evolutionary algorithm and simulated refactorings that do not change the system's externally visible behavior.
The approach is evaluated using the open-source case study {JHotDraw}.},
  booktitle = {Proceedings of the 8th Annual Conference on Genetic and Evolutionary Computation},
  publisher = {{ACM}},
  author = {Seng, Olaf and Stammel, Johannes and Burkhart, David},
  year = {2006},
  keywords = {evolutionary algorithms, metrics, move method, refactoring},
  pages = {1909--1916},
  url = {http://portal.acm.org/citation.cfm?doid=1143997.1144315},
  annote = {The paper concentrates on the "move method" refactoring. While they advocate a genetic algorithm kind of approach, they also build in a lot of special case knowledge about where to do crossovers, etc. Relevance: 4 }
},
@inproceedings{oca_identification_1998,
  title = {Identification of Data Cohesive Subsystems Using Data Mining Techniques},
  isbn = {0-8186-8779-7},
  url = {http://portal.acm.org/citation.cfm?id=850947.853324},
  abstract = {The activity of reengineering and maintaining large legacy systems involves the use of design recovery techniques to produce abstractions that facilitate the understanding of the system. In this paper, we present an approach to design recovery based on data mining. This approach derives from the observation that data mining can discover unsuspected non-trivial relationships among elements in large databases. This observation suggests that data mining can be used to elicit new knowledge about the design of a subject system and that it can be applied to large legacy systems. We describe the {ISA} methodology which uses data mining to identify data cohesive subsystems. We were able to decompose {COBOL} systems into subsystems by using this approach. Our experience shows that data mining can identify data cohesive subsystems without any previous knowledge of the subject system.
Furthermore, data mining can produce meaningful results regardless of system size making this approach especially appropriate to the analysis of large undocumented systems.},
  booktitle = {Proceedings of the International Conference on Software Maintenance},
  publisher = {{IEEE} Computer Society},
  author = {{Montes de Oca}, Carlos and Carver, Doris L.},
  year = {1998},
  keywords = {association rules, cohesion, data mining, design recovery, metrics, subsystem identification},
  pages = {16}
},
@article{maqbool_automated_2006,
  title = {Automated software clustering: An insight using cluster labels},
  volume = {79},
  issn = {0164-1212},
  shorttitle = {Automated software clustering},
  url = {http://www.sciencedirect.com/science/article/B6V0N-4JT3RXB-1/2/6003e06cefd15d1d6193c28ef95b8be9},
  doi = {10.1016/j.jss.2006.03.013},
  abstract = {Clustering techniques have shown promising results for the architecture recovery and re-modularization of legacy software systems. Clusters that are obtained as a result of the clustering process may not be easy to interpret until they are assigned appropriate labels. Automatic labeling of clusters reduces the time required to understand them and can also be used to evaluate the effectiveness of the clustering process, if the assigned labels are meaningful and convey the purpose of each cluster effectively. In this paper, we present a labeling scheme based on identifiers of an entity. As the clustering process proceeds, keywords within identifiers are ranked using two ranking schemes: frequency and inverse frequency. We present experimental results to demonstrate the effectiveness of our labeling approach. A comparison between the ranking schemes reveals the inverse frequency scheme to form more meaningful labels, especially for small clusters.
A comparison of clustering results of the complete and weighted combined algorithms based on labels of the clusters produced by them during clustering shows that the latter produces a more understandable cluster hierarchy with easily identifiable software sub-systems.},
  number = {11},
  journal = {Journal of Systems and Software},
  author = {Maqbool, O. and Babri, H. A.},
  month = nov,
  year = {2006},
  keywords = {clustering, semantics, software clustering},
  pages = {1632--1648}
},
@article{rolling_preliminary_1994,
  title = {A preliminary annotated bibliography on domain engineering},
  volume = {19},
  url = {http://portal.acm.org/citation.cfm?id=182844},
  doi = {10.1145/182824.182844},
  number = {3},
  journal = {{ACM} {SIGSOFT} Software Engineering Notes},
  author = {Rolling, Walter A.},
  year = {1994},
  keywords = {domain analysis, domain engineering, {OOD}, software reuse},
  pages = {82--84}
},
@article{fortunato_community_2010,
  title = {Community detection in graphs},
  volume = {486},
  issn = {0370-1573},
  url = {http://www.sciencedirect.com/science/article/B6TVP-4XPYXF1-1/2/99061fac6435db4343b2374d26e64ac1},
  doi = {10.1016/j.physrep.2009.11.002},
  abstract = {The modern science of networks has brought significant advances to our understanding of complex systems. One of the most relevant features of graphs representing real systems is community structure, or clustering, i.e. the organization of vertices in clusters, with many edges joining vertices of the same cluster and comparatively few edges joining vertices of different clusters. Such clusters, or communities, can be considered as fairly independent compartments of a graph, playing a similar role like, e.g., the tissues or the organs in the human body. Detecting communities is of great importance in sociology, biology and computer science, disciplines where systems are often represented as graphs.
This problem is very hard and not yet satisfactorily solved, despite the huge effort of a large interdisciplinary community of scientists working on it over the past few years. We will attempt a thorough exposition of the topic, from the definition of the main elements of the problem, to the presentation of most methods developed, with a special focus on techniques designed by statistical physicists, from the discussion of crucial issues like the significance of clustering and how methods should be tested and compared against each other, to the description of applications to real networks.},
  number = {3--5},
  journal = {Physics Reports},
  author = {Fortunato, Santo},
  month = feb,
  year = {2010},
  keywords = {clustering, graphs},
  pages = {75--174}
},
@inproceedings{irwin_object_2003,
  title = {Object oriented metrics: Precision tools and configurable visualisations},
  abstract = {Software metrics are a valuable tool in helping software engineers to develop large, complex software systems. However, it is vital that transparency and precision are maintained at all stages. We contend that without grammars we cannot define metrics rigorously, without transparent and powerful parsing tools we cannot collect data accurately and without flexible configurable visualisation we cannot exploit the full potential of our data. In this paper, we report the development of {JST}, a semantic analyser for Java, and show how it is incorporated into our pipeline-based approach to metrics collection and visualisation. We describe a new visualisation, class clusters, which not only demonstrate the data generated by our tools but also illustrate the value of {3D} virtual worlds for visualising software metrics.},
  booktitle = {9th International Software Metrics Symposium},
  author = {Irwin, W.
and Churcher, N.}, year = {2003}, keywords = {metrics, visualization}, pages = {112--123} }, @article{czibula_hierarchical_2007-1, title = {Hierarchical Clustering for Software Systems Restructuring}, volume = {6}, abstract = {Improving the quality of software systems design is the most important issue during the evolution of object oriented software systems. In this paper we are focusing on the problem of determining refactorings that can be used in order to improve the design of object oriented software systems. Refactoring ([6]) is a major issue to improve internal software quality. This paper aims at presenting a new hierarchical agglomerative clustering algorithm, {HARS} {(Hierarchical} agglomerative clustering algorithm for restructuring software systems), that identifies the refactorings needed in order to restructure a software system. Clustering([10]) is used in order to recondition the class structure of the system. The proposed approach can be useful for assisting software engineers in their daily works of refactoring software systems. We evaluate our approach using the open source case study {JHotDraw} ([7]), emphasizing its advantages in comparison with existing approaches.}, journal = {{INFOCOMP} Journal of Computer Science, Brasil}, author = {Czibula, Istv{\'a}n-Gergely and Serban, Gabriela}, year = {2007}, keywords = {clustering, refactoring, software clustering}, pages = {43--51} }, @article{parsons_subspace_2004, title = {Subspace clustering for high dimensional data: a review}, volume = {6}, shorttitle = {Subspace clustering for high dimensional data}, url = {http://portal.acm.org/citation.cfm?id=1007730.1007731}, doi = {10.1145/1007730.1007731}, abstract = {Subspace clustering is an extension of traditional clustering that seeks to find clusters in different subspaces within a dataset. Often in high dimensional data, many dimensions are irrelevant and can mask existing clusters in noisy data. 
Feature selection removes irrelevant and redundant dimensions by analyzing the entire dataset. Subspace clustering algorithms localize the search for relevant dimensions allowing them to find clusters that exist in multiple, possibly overlapping subspaces. There are two major branches of subspace clustering based on their search strategy. Top-down algorithms find an initial clustering in the full set of dimensions and evaluate the subspaces of each cluster, iteratively improving the results. Bottom-up approaches find dense regions in low dimensional spaces and combine them to form clusters. This paper presents a survey of the various subspace clustering algorithms along with a hierarchy organizing the algorithms by their defining characteristics. We then compare the two main approaches to subspace clustering using empirical scalability and accuracy tests and discuss some potential applications where subspace clustering could be particularly useful.}, number = {1}, journal = {{ACM} {SIGKDD} Explorations Newsletter}, author = {Parsons, Lance and Haque, Ehtesham and Liu, Huan}, year = {2004}, keywords = {clustering, high dimensional data, subspace clustering, survey}, pages = {90--105} }, @inproceedings{wheeldon_power_2003, title = {Power law distributions in class relationships}, abstract = {Power law distributions have been found in many natural and social phenomena, and more recently in the source code and run-time characteristics of {Object-Oriented} {(OO)} systems. A power law implies that small values are extremely common, whereas large values are extremely rare. We identify twelve new power laws relating to the static graph structures of Java programs. The graph structures analyzed represented different forms of {OO} coupling, namely, inheritance, aggregation, interface, parameter type and return type. Identification of these new laws provides the basis for predicting likely features of classes in future developments. 
The research ties together work in object-based coupling and World Wide Web structures.}, booktitle = {Proceedings of the Third {IEEE} International Workshop on Source Code Analysis and Manipulation}, author = {Wheeldon, R. and Counsell, S.}, year = {2003}, keywords = {aggregation, coupling, data flow graphs, inheritance, {OOP}, powerlaws, static graph structure}, pages = {45--54} }, @book{hartigan_clustering_1975, title = {Clustering algorithms}, isbn = {9780471356455}, publisher = {Wiley}, author = {Hartigan, John A.}, year = {1975}, keywords = {algorithms, cluster analysis, Electronic data processing, Numerical taxonomy} }, @techreport{cassell_initial_2010, address = {Wellington, {NZ}}, type = {Technical Report}, title = {An Initial Test Suite for Automated Extract Class Refactorings}, url = {http://ecs.victoria.ac.nz/twiki/pub/Main/TechnicalReportSeries/ECSTR10-21.pdf}, abstract = {When developing object-oriented classes, it is difficult to determine how to best refactor large, complex classes to create smaller, more cohesive ones. Automated algorithms can recommend solutions, but how can a programmer feel confident that an algorithm's recommendations are good ones? The test suite described here provides test classes for use as inputs to these automated algorithms, together with the preferred results - the class members that should be distributed to the extracted class. By comparing the actual results to the expected ones, a programmer can have some confidence that his algorithms are providing useful suggestions for refactoring.}, number = {{ECSTR} 10-21}, institution = {Victoria University of Wellington, Dept. 
{ECS}}, author = {Cassell, Keith and Andreae, Peter and Groves, Lindsay and Noble, James}, month = sep, year = {2010}, keywords = {extract class, refactoring, test suite}, pages = {13} }, @article{salton_vector_1975, title = {A vector space model for automatic indexing}, volume = {18}, issn = {0001-0782}, url = {http://portal.acm.org/citation.cfm?id=361220}, doi = {10.1145/361219.361220}, abstract = {In a document retrieval, or other pattern matching environment where stored entities (documents) are compared with each other or with incoming patterns (search requests), it appears that the best indexing (property) space is one where each entity lies as far away from the others as possible; in these circumstances the value of an indexing system may be expressible as a function of the density of the object space; in particular, retrieval performance may correlate inversely with space density. An approach based on space density computations is used to choose an optimum indexing vocabulary for a collection of documents. Typical evaluation results are shown, demonstrating the usefulness of the model.}, number = {11}, journal = {Communications of the {ACM}}, author = {Salton, G. and Wong, A. and Yang, C. S.}, month = nov, year = {1975}, keywords = {information retrieval, semantics}, pages = {613--620} }, @article{antonellis_employing_2008, title = {Employing Clustering for Assisting Source Code Maintainability Evaluation according to {ISO/IEC-9126}}, abstract = {This paper elaborates on how to use clustering for the evaluation of a software system{\textquoteright}s maintainability according to the {ISO/IEC-9126} quality standard. More specifically it proposes a methodology that combines clustering and multicriteria decision aid techniques for knowledge acquisition by integrating groups of data from source code with the expertise of a software system{\textquoteright}s evaluators. 
A process for the extraction of elements from source code and Analytical Hierarchical Processing for assigning weights to these data are provided; {k-Attractors} clustering algorithm is then applied on these data, in order to produce system overviews and deductions. The methodology is evaluated on Apache Geronimo, a large Open Source Application Server; results are discussed and conclusions are presented together with directions for future work.}, author = {Antonellis, P. and Antoniou, D. and Kanellopoulos, Y. and Makris, C. and Theodoridis, E. and Tjortjis, C. and Tsirakis, N.}, year = {2008}, keywords = {clustering, maintainability, software clustering} }, @book{astels_test_2003, title = {Test Driven Development: A Practical Guide}, isbn = {0131016490}, publisher = {Prentice Hall Professional Technical Reference}, author = {Astels, Dave}, year = {2003} }, @article{stein_exploring_2005, title = {Exploring the relationship between cohesion and complexity}, url = {http://findarticles.com/p/articles/mi_m0VVT/is_2_1/ai_n25121083/}, abstract = {Many metrics have been proposed to measure the complexity or cohesion of object-oriented software. However, the complexity or cohesion of a piece of software is more difficult to capture than these metrics imply. In fact, studies have shown that existing metrics consistently fail to capture complexity or cohesion well. This study explores the reasons behind these results: cohesion is difficult to capture from syntactic elements of code, complexity is too multi-faceted to be captured by one metric and the qualities of complexity and cohesion are not independent. These factors have resulted in metrics that are purported to measure complexity or cohesion but are inadequate or misclassified. 
This study shows that there is overlap between some of the complexity and cohesion metrics and points to a more basic relationship between complexity and cohesion: that a lack of cohesion may be associated with high complexity.}, journal = {Journal of Computer Science}, author = {Stein, Cara E. and Cox, Glenn W. and Etzkorn, L. H.}, month = apr, year = {2005}, keywords = {cohesion, complexity, metrics} }, @book{ambler_object_2004, title = {The object primer: agile model-driven development with {UML} 2.0}, isbn = {9780521540186}, shorttitle = {The object primer}, publisher = {Cambridge University Press}, author = {Ambler, Scott W.}, year = {2004}, annote = {{"Refactor} ruthlessly" is one of his philosophies for effective programming.} }, @techreport{cassell_tool_1985, title = {A Tool for the Static Analysis of Prolog Programs}, number = {{MCC} Technical Report Number {DB-148-85}}, institution = {{MCC}}, author = {Cassell, Keith and Keller, Tom}, year = {1985} }, @inproceedings{martin_oo_1994, title = {{OO} design quality metrics}, abstract = {This paper describes a set of metrics that can be used to measure the quality of an object-oriented design in terms of the interdependence between the subsystems of that design. Designs which are highly interdependent tend to be rigid, unreusable and hard to maintain. Yet interdependence is necessary if the subsystems of the design are to collaborate. Thus, some forms of dependency must be desirable, and other forms must be undesirable. This paper proposes a design pattern in which all the dependencies are of the desirable form. Finally, this paper describes a set of metrics that measure the conformance of a design to the desirable pattern.}, author = {Martin, Robert}, month = oct, year = {1994}, keywords = {coupling, metrics} }, @inproceedings{bergel_visualizing_2010, title = {Visualizing Dynamic Metrics with Profiling Blueprints}, abstract = {
While traditional approaches to code profiling help locate performance bottlenecks, they offer only limited support for removing these bottlenecks. The main reason is the lack of visual and detailed runtime information to identify and eliminate computation redundancy. We provide two profiling blueprints which help identify and remove performance bottlenecks. The structural distribution blueprint graphically represents the {CPU} consumption share for each method and class of an application. The behavioral distribution blueprint depicts the distribution of {CPU} consumption along method invocations, and hints at method candidates for caching optimizations. These two blueprints helped us to significantly optimize Mondrian, an open source visualization engine. Our implementation is freely available for the Pharo development environment and has been evaluated in a number of different scenarios.}, booktitle = {Proceedings of the 48th International Conference on Objects, Models, Components, Patterns}, publisher = {{LNCS} Springer Verlag}, author = {Bergel, Alexandre and Robbes, Romain and Binder, Walter}, year = {2010}, keywords = {metrics, visualization} }, @article{mitchell_clustering_2007, title = {Clustering Software Systems to Identify Subsystem Structures}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.8.5490}, doi = {10.1.1.8.5490}, abstract = {As the size of software systems continues to grow, understanding the structure of these systems gets harder. This coupled with associated problems such as of lack of current documentation, and the limited or nonexistent availability of the original designers of the system, adds further difficulty to the job of software professionals trying to understand the structure of large and complex systems. The application of clustering techniques and tools to software systems helps software designers, developers, and maintenance programmers by recovering high-level views of system designs. 
In this paper we survey clustering approaches that have been developed by software engineering researchers. We also examine classical clustering techniques that have been applied in mathematics, science, and engineering, and investigate how these techniques have been adapted to work in the software domain. We conclude with a discussion of open research challenges related to software clustering.}, author = {Mitchell, Brian}, year = {2007}, keywords = {clustering, data mining, reverse engineering, software clustering, subsystem identification}, annote = {This is difficult to reference. It appears to be an unnumbered Drexel University technical report from around 2000, but Mitchell seems to have published related work. Relevance: 3 } }, @inproceedings{zhang_empirical_2007, address = {Nagoya, Aichi, Japan}, title = {An Empirical Study of Class Sizes for Large Java Systems}, url = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel5%2F4425817%2F4425818%2F04425860.pdf%3Farnumber%3D4425860&authDecision=-203}, doi = {10.1109/ASPEC.2007.64}, booktitle = {14th {Asia-Pacific} Software Engineering Conference {(APSEC'07)}}, author = {Zhang, Hongyu and Tan, Hee Beng Kuan}, month = dec, year = {2007}, keywords = {empirical, metrics, size}, pages = {230--237}, annote = {{LOC} only } }, @inproceedings{anquetil_legacy_2011, address = {Los Alamitos, {CA}, {USA}}, title = {Legacy Software Restructuring: Analyzing a Concrete Case}, shorttitle = {Legacy Software Restructuring}, doi = {10.1109/CSMR.2011.34}, abstract = {Software re-modularization is an old preoccupation of reverse engineering research. The advantages of a well structured or modularized system are well known. Yet after so much time and efforts, the field seems unable to come up with solutions that make a clear difference in practice. Recently, some researchers started to question whether some basic assumptions of the field were not overrated. 
The main one consists in evaluating the high-cohesion/low-coupling dogma with metrics of unknown relevance. In this paper, we study a real structuring case (on the Eclipse platform) to try to better understand if (some) existing metrics would have helped the software engineers in the task. Results show that the cohesion and coupling metrics used in the experiment did not behave as expected and would probably not have helped the maintainers reach their goal. We also measured another possible restructuring which is to decrease the number of cyclic dependencies between modules. Again, the results did not meet expectations.}, booktitle = {European Conference on Software Maintenance and Reengineering}, publisher = {{IEEE} Computer Society}, author = {Anquetil, Nicolas and Laval, Jannik}, year = {2011}, keywords = {cohesion, coupling, metrics, re-modularization, restructuring}, pages = {279--286} }, @article{mens_survey_2004-1, title = {A survey of software refactoring}, volume = {30}, issn = {0098-5589}, doi = {10.1109/TSE.2004.1265817}, abstract = {We provide an extensive overview of existing research in the field of software refactoring. This research is compared and discussed based on a number of different criteria: the refactoring activities that are supported, the specific techniques and formalisms that are used for supporting these activities, the types of software artifacts that are being refactored, the important issues that need to be taken into account when building refactoring tool support, and the effect of refactoring on the software process. A running example is used to explain and illustrate the main concepts.}, number = {2}, journal = {{IEEE} Transactions on Software Engineering}, author = {Mens, T. 
and Tourw{\'e}, T.}, month = feb, year = {2004}, keywords = {bibliographies, construction tool, costs, object-oriented programming, programming environment, programming environments, reverse engineering, software maintenance, software quality, software refactoring, software reusability, software tools, spirals, taxonomy}, pages = {126--139} }, @inproceedings{crespo_relative_2006, address = {Lugano, Switzerland}, title = {Relative Thresholds: Case Study to Incorporate Metrics in the Detection of Bad Smells}, shorttitle = {Relative Thresholds}, abstract = {To detect flaws, bad smells, etc, we often use quantitative methods: metrics or measures. It is common in practice to use thresholds to set the correctness of the measures. Most of the current tools use generic values. Nevertheless, there is a certain concern about the effects of threshold applications on obtained values. Current research is working on case studies of thresholds for several products and different versions. However, product domain and size could also affect the results, so we deal with the question of using generic vs. relative thresholds, looking at what effects this could have in bad smell detection.}, booktitle = {Proceedings of 10th {ECOOP} Workshop on Quantitative Approaches in {Object-Oriented} Software Engineering}, author = {Crespo, Y. and Lopez, C. and Marticorena, R.}, year = {2006}, keywords = {empirical, metrics, refactoring, smells}, pages = {109--118}, annote = {Contains empirical data for a number of metrics on a number of open source projects. Has a large class detection query. Notes long tails causing differences between the medians and means. 
Also notes significant variances in distributions across projects.} }, @book{lorenz_object-oriented_1994, address = {Englewood Cliffs, {NJ}}, series = {Prentice Hall object-oriented series}, title = {{Object-Oriented} Software Metrics: A Practical Guide}, isbn = {{013179292X}}, lccn = {{QA76.64} {.L67} 1994}, shorttitle = {{Object-Oriented} Software Metrics}, publisher = {{PTR} Prentice Hall}, author = {Lorenz, Mark and Kidd, Jeff}, year = {1994}, keywords = {metrics}, annote = {One of the earliest books on {OO} metrics. It includes recommended thresholds for various metrics, e.g. \# of public instance methods in a class should be {\textless}= 20 ({\textless}= 40 {UI).} Instance variables {\textless}= 3 ({\textless}= 9 {UI).} It also considers "affecting factors" for the various metrics, for example, whether it is a {UI} class, how high the class is in the inheritance hierarchy, etc. Relevance: 3} }, @inproceedings{chaumun_design_2000, title = {Design properties and object-oriented software changeability}, doi = {10.1109/CSMR.2000.827305}, abstract = {The assessment of the changeability of software systems is of major concern for buyers of the large systems found in fast-moving domains such as telecommunications. One way of approaching this problem is to investigate the dependency between the changeability of the software and its design, with the goal of finding design properties that can be used as changeability indicators. In our research, we defined a model of software changes and change impacts, and implemented it for the C++ language. Furthermore, we identified a set of nine object-oriented {(OO)} design metrics, four of which are specifically geared towards changeability detection. The model and the metrics were applied to three test systems of industrial size. The experiment showed a high correlation, across systems and across changes, between changeability and the access to a class by other classes through method invocation or variable access. 
On the other hand, no result could support the hypothesis that the depth of the inheritance tree has some influence on changeability. Furthermore, our results confirm the observation of others that the use of inheritance is rather limited in industrial systems.}, booktitle = {Proceedings of the Fourth European Conference on Software Maintenance and Reengineering}, author = {Chaumun, M. A. and Kabaili, H. and Keller, R. K. and Lustman, F. and {Saint-Denis}, G.}, year = {2000}, keywords = {correlation, inheritance, maintainability, maintenance, metrics, {OOD}, {OOP}}, pages = {45--54} }, @incollection{dig_refactoring_2008, title = {Refactoring Tools}, url = {http://dx.doi.org/10.1007/978-3-540-78195-0_19}, abstract = {{WRT{\textquoteright}07} was the first instance of the Workshop on Refactoring Tools. It was held in Berlin, Germany, on July 31st, in conjunction with {ECOOP{\textquoteright}07.} The workshop brought together over 50 participants from both academia and industry. Participants include the lead developers of two widely used refactoring engines {(Eclipse} and {NetBeans)}, researchers that work on refactoring tools and techniques, and others generally interested in refactoring. {WRT{\textquoteright}07} accepted 32 submissions, however, it was impossible to present all these submissions in one single day. Instead, in the morning session we started with a few technical presentations, followed by large group discussions around noon, a poster session and small group discussions in the afternoon. {WRT{\textquoteright}07} ended with a retrospective session and unanimous consensus to organize another session in the future.}, booktitle = {{Object-Oriented} Technology. 
{ECOOP} 2007 Workshop Reader}, author = {Dig, Danny and Johnson, Ralph and Tip, Frank and De Moor, Oege and Becicka, Jan and Griswold, William and Keller, Markus}, year = {2008}, keywords = {Eclipse, refactoring}, pages = {193--202} }, @inproceedings{demeyer_lan-simulation:_2005, title = {The {LAN-simulation:} A Refactoring Teaching Example}, isbn = {0-7695-2349-8}, shorttitle = {The {LAN-simulation}}, url = {http://portal.acm.org/citation.cfm?id=1107840.1108151}, abstract = {The notion of refactoring - transforming the sourcecode of an object-oriented program without changing its external behaviour - has been studied intensively within the last decade. This diversity has created a plethora of toy-examples, cases and code snippets, which make it hard to assess the current state-of-the-art. Moreover, due to this diversity, there is currently no accepted way of teaching good refactoring practices, despite the acknowledgment in the software engineering body of knowledge. Therefore, this paper presents a common example - the {LAN} simulation - which has been used by a number of European Universities for both research and teaching purposes.}, booktitle = {Proceedings of the Eighth International Workshop on Principles of Software Evolution}, publisher = {{IEEE} Computer Society}, author = {Demeyer, Serge and Van Rysselberghe, Filip and Girba, Tudor and Ratzinger, Jacek and Marinescu, Radu and Mens, Tom and Du Bois, Bart and Janssens, Dirk and Ducasse, St{\'e}phane and Lanza, Michele and Rieger, Matthias and Gall, Harald and {El-Ramly}, Mohammad}, year = {2005}, keywords = {refactoring, test suite}, pages = {123--134} }, @inproceedings{fokaefs_jdeodorant:_2011, address = {New York, {NY}, {USA}}, series = {{ICSE} '11}, title = {{JDeodorant:} identification and application of extract class refactorings}, isbn = {978-1-4503-0445-0}, location = {Waikiki, Honolulu, {HI}, {USA}}, shorttitle = {{JDeodorant}}, doi = {10.1145/1985793.1985989}, abstract = {Evolutionary changes in 
object-oriented systems can result in large, complex classes, known as {"God} Classes". In this paper, we present a tool, developed as part of the {JDeodorant} Eclipse plugin, that can recognize opportunities for extracting cohesive classes from {"God} Classes" and automatically apply the refactoring chosen by the developer.}, booktitle = {Proceedings of the 33rd International Conference on Software Engineering}, publisher = {{ACM}}, author = {Fokaefs, Marios and Tsantalis, Nikolaos and Stroulia, Eleni and Chatzigeorgiou, Alexander}, year = {2011}, keywords = {clustering, design, object-oriented programming, refactoring, software clustering, software reengineering}, pages = {1037--1039} }, @inproceedings{streckenbach_refactoring_2004, address = {Vancouver, {BC}, Canada}, title = {Refactoring class hierarchies with {KABA}}, isbn = {1-58113-831-9}, doi = {10.1145/1028976.1029003}, abstract = {{KABA} is an innovative system for refactoring Java class hierarchies. It uses the {Snelting/Tip} algorithm [13] in order to determine a behavior-preserving refactoring which is optimal with respect to a given set of client programs. {KABA} can be based on dynamic as well as static program analysis. The static variant will preserve program behavior for all possible input values; the dynamic version guarantees preservation of behavior for all runs in a given test suite. 
{KABA} offers automatic refactoring as well as manual refactoring using a dedicated editor.}, booktitle = {Proceedings of the 19th Annual {ACM} {SIGPLAN} Conference on {Object-Oriented} Programming, Systems, Languages, and Applications}, publisher = {{ACM}}, author = {Streckenbach, Mirko and Snelting, Gregor}, year = {2004}, keywords = {inheritance, refactoring}, pages = {315--330}, url = {http://portal.acm.org/citation.cfm?id=1028976.1029003} }, @inproceedings{cordy_comprehending_2003, title = {Comprehending reality-practical barriers to industrial adoption of software maintenance automation}, isbn = {0769518834}, abstract = {Recent years have seen many significant advances in program comprehension and software maintenance automation technology. In spite of the enormous potential savings in software maintenance costs, for the most part adoption of these ideas in industry remains at the experimental prototype stage. In this paper I explore some of the practical reasons for industrial resistance to adoption of software maintenance automation. Based on the experience of six years of software maintenance automation services to the financial industry involving more than 4.5 Gloc of code at Legasys Corporation, I discuss some of the social, technical and business realities that lie at the root of this resistance, outline various Legasys attempts to overcome these barriers, and suggest some approaches to software maintenance automation that may lead to higher levels of industrial acceptance in the future.}, booktitle = {11th {IEEE} International Workshop on Program Comprehension}, author = {Cordy, J. 
R.}, year = {2003}, keywords = {maintainability, maintenance}, pages = {196--205} }, @techreport{cassell_visualizing_2010, address = {Wellington, {NZ}}, type = {Technical Report}, title = {Visualizing Class Refactoring via Clustering}, url = {http://ecs.victoria.ac.nz/twiki/pub/Main/TechnicalReportSeries/ECSTR10-17.pdf}, abstract = {When developing object-oriented classes, it is difficult to determine how to best reallocate the members of large, complex classes to create smaller, more cohesive ones. Clustering techniques can provide guidance on how to solve this allocation problem; however, inappropriate use of clustering can result in a class structure that is less maintainable than the original. The {ExtC} Visualizer helps the programmer understand the class structure by visually emphasizing important features of the class's members and their interrelationships. More importantly, it helps users see how various clustering algorithms group the class's members. These insights help a programmer choose appropriate techniques for refactoring large classes.}, number = {{ECSTR} 10-17}, institution = {Victoria University of Wellington, Dept. {ECS}}, author = {Cassell, Keith and Anslow, Craig and Groves, Lindsay and Andreae, Peter}, month = jul, year = {2010}, keywords = {clustering, graphs, refactoring, visualization}, pages = {10} }, @article{cohen_jtl:_2006, title = {{JTL:} the Java tools language}, volume = {41}, shorttitle = {{JTL}}, url = {http://portal.acm.org/citation.cfm?id=1167481&dl=GUIDE&coll=GUIDE&CFID=18765553&CFTOKEN=94706159}, doi = {10.1145/1167515.1167481}, abstract = {We present an overview of {JTL} (the Java Tools Language, pronounced {"Gee-tel")}, a novel language for querying {JAVA} [8] programs. 
{JTL} was designed to serve the development of source code software tools for {JAVA}, and as a small language which to aid programming language extensions to {JAVA.} Applications include definition of pointcuts for aspect-oriented programming, fixing type constraints for generic programming, specification of encapsulation policies, definition of micro-patterns, etc. We argue that the {JTL} expression of each of these is systematic, concise, intuitive and general. {JTL} relies on a simply-typed relational database for program representation, rather than an abstract syntax tree. The underlying semantics of the language is restricted to queries formulated in First Order Predicate Logic augmented with transitive closure {(FOPL).} Special effort was taken to ensure terse, yet readable expression of logical conditions. The {JTL} pattern public abstract class, for example, matches all abstract classes which are publicly accessible, while class (public clone();) matches all classes in which method clone is public. 
To this end, {JTL} relies on a {DATALOG-like} syntax and semantics, enriched with quantifiers and pattern matching which all but entirely eliminate the need for recursive calls. {JTL's} query analyzer gives special attention to the fragility of the "closed world assumption" in examining {JAVA} software, and determines whether a query relies on such an assumption. The performance of the {JTL} interpreter is comparable to that of {JQuery} after it generated its database cache, and at least an order of magnitude faster when the cache has to be rebuilt.}, number = {10}, journal = {{ACM} {SIGPLAN} Notices}, author = {Cohen, Tal and Gil, Joseph {(Yossi)} and Maman, Itay}, year = {2006}, keywords = {declarative programming, query language, reverse engineering}, pages = {89--108}, annote = {This paper is difficult to read. It describes a Java-specific query language with a semantics somewhat similar to {Prolog/Datalog.} The syntax of the language appears convoluted. It appears from the examples that they have tried to make the pattern matching simple for Java programmers, but I think this has made it more difficult to grasp as there is too much "syntactic sugar". Like {.QL}, it stores program information in a database and reasons over it. It claims to provide both queries and actions, so in theory, one should be able to both detect a pattern and modify it. This paper was written in 2006. The first release 1.0 or above was in 10/2007, so there has probably been considerable improvement. See http://openjtl.sourceforge.net. Relevance: 5} }, @article{chae_cohesion_2000, title = {A cohesion measure for object-oriented classes}, volume = {30}, abstract = {In object-oriented systems, cohesion refers to the degree of the relatedness of the members in a class and strong cohesion has been recognized as a highly desirable property of classes. 
We note that the existing cohesion measures do not take into account some characteristics of classes, and thus often fail to properly reflect the cohesiveness of classes. To cope with such a problem, we propose a new cohesion measure where the characteristics of classes are incorporated. Our cohesion measure takes into account the members that actually have impact on the cohesiveness of a class, and is defined in terms of the degree of the connectivity among those members. We develop a cohesion measurement tool for C++ programs, and perform a case study on a well-known class library in order to demonstrate the effectiveness of our new measure. By performing principal component analysis, we also demonstrate that our measure captures a new aspect of class properties which is not captured by the existing cohesion measures.}, number = {12}, journal = {Software: Practice and Experience}, author = {Chae, Heung Seok and Kwon, Yong Rae and Bae, Doo-Hwan}, year = {2000}, keywords = {cohesion, comparative study, graphs, metrics}, pages = {1405--1431}, annote = {Discusses drawbacks with existing cohesion metrics, especially having to do with special methods (accessors, delegators, constructors, destructors). Includes some figures with representative graphs that illustrate the weaknesses of the metrics. Includes tables showing the variety of values that can be produced by different metrics for the same graph. Defines "reference graph" as an undirected graph. It is two levels - methods and attributes. Introduces a graph-based metric for cohesion {(CBMC).} This metric is based on separating out maximally connected components via removals of nodes representing glue methods. 
Relevance: 5}, annote = {http://portal.acm.org/citation.cfm?id=362449} }, @book{demeyer_object_2002, edition = {1st}, title = {Object Oriented Reengineering Patterns}, isbn = {1558606394}, publisher = {Morgan Kaufmann}, author = {Demeyer, Serge and Ducasse, Stephane and Nierstrasz, Oscar}, month = jul, year = {2002}, keywords = {patterns, restructuring} }, @article{mccabe_complexity_1976, title = {A Complexity Measure}, volume = {{SE-2}}, issn = {0098-5589}, abstract = {This paper describes a graph-theoretic complexity measure and illustrates how it can be used to manage and control program complexity. The paper first explains how the graph-theory concepts apply and gives an intuitive explanation of the graph concepts in programming terms. The control graphs of several actual Fortran programs are then presented to illustrate the correlation between intuitive complexity and the graph-theoretic complexity. Several properties of the graph-theoretic complexity are then proved which show, for example, that complexity is independent of physical size (adding or subtracting functional statements leaves complexity unchanged) and complexity depends only on the decision structure of a program.}, number = {4}, journal = {{IEEE} Transactions on Software Engineering}, author = {{McCabe}, {T.J.}}, year = {1976}, keywords = {complexity, decomposition, metrics, testing}, pages = {308--320} }, @article{arnold_software_1989, title = {Software restructuring}, volume = {77}, issn = {0018-9219}, doi = {10.1109/5.24146}, abstract = {The author provides a brief tutorial on software restructuring. He discusses what restructuring is, advantages and disadvantages of restructuring, tools and case studies, and future possibilities. The reader is assumed to have a general appreciation for building and maintaining software systems. 
The aim is to provide the reader with a feel for the strengths, weaknesses, and capabilities of software restructuring technology}, number = {4}, journal = {Proceedings of the {IEEE}}, author = {Arnold, Robert S.}, year = {1989}, keywords = {restructuring, tools, tutorial}, pages = {607--617} }, @inproceedings{corazza_investigating_2011, address = {Los Alamitos, {CA}, {USA}}, title = {Investigating the Use of Lexical Information for Software System Clustering}, doi = {10.1109/CSMR.2011.8}, abstract = {Developers have a lot of freedom in writing comments as well as in choosing identifiers and method names. These are intentional in nature and provide a different relevance of information to understand what a software system implements, and in particular the role of each source file. In this paper we investigate the effectiveness of exploiting lexical information for software system clustering. In particular we explore the contribution of the combined use of six different dictionaries, corresponding to the six parts of the source code where programmers introduce lexical information, namely: class, attribute, method and parameter names, comments, and source code statements. Their relevance has been weighted by means of a probabilistic model, whose parameters have been estimated by the {Expectation-Maximization} algorithm. To group source files accordingly we used a hierarchical clustering algorithm. 
The investigation has been conducted on a dataset of 13 open source Java software systems.}, booktitle = {Software Maintenance and Reengineering, European Conference on}, publisher = {{IEEE} Computer Society}, author = {Corazza, Anna and Di Martino, Sergio and Maggio, Valerio and Scanniello, Giuseppe}, year = {2011}, keywords = {clustering, lexical information, probabilistic model, software clustering, software remodularization}, pages = {35--44} }, @article{briand_unified_1998, title = {A unified framework for cohesion measurement in object-oriented systems}, volume = {3}, issn = {1382-3256}, url = {http://dx.doi.org/10.1023/A:1009783721306}, abstract = {The increasing importance being placed on software measurement has led to an increased amount of research developing new software measures. Given the importance of object-oriented development techniques, one specific area where this has occurred is cohesion measurement in object-oriented systems. However, despite a very interesting body of work, there is little understanding of the motivation and empirical hypotheses behind many of these new measures. It is often difficult to determine how such measures relate to one another and for which application they can be used. As a consequence, it is very difficult for practitioners and researchers to obtain a clear picture of the state-of-the-art in order to select or define cohesion measures for object-oriented systems. This situation is addressed and clarified through several different activities. First, a standardized terminology and formalism for expressing measures is provided which ensures that all measures using it are expressed in a fully consistent and operational manner. Second, to provide a structured synthesis, a review of the existing approaches to measure cohesion in object-oriented systems takes place. 
Third, a unified framework, based on the issues discovered in the review, is provided and all existing measures are then classified according to this framework. Finally, a review of the empirical validation work concerning existing cohesion measures is provided. This paper contributes to an increased understanding of the state-of-the-art: a mechanism is provided for comparing measures and their potential use, integrating existing measures which examine the same concepts in different ways, and facilitating more rigorous decision making regarding the definition of new measures and the selection of existing measures for a specific goal of measurement. In addition, our review of the state-of-the-art highlights several important issues: (i) many measures are not defined in a fully operational form, (ii) relatively few of them are based on explicit empirical models as recommended by measurement theory, and (iii) an even smaller number of measures have been empirically validated; thus, the usefulness of many measures has yet to be demonstrated.}, number = {1}, journal = {Empirical Software Engineering}, author = {Briand, Lionel C. and Daly, John W. and W{\"u}st, J{\"u}rgen}, year = {1998}, doi = {10.1023/A:1009783721306}, pages = {65--117} }, @inproceedings{sager_detecting_2006, address = {Shanghai, China}, title = {Detecting similar {Java} classes using tree algorithms}, isbn = {1-59593-397-2}, url = {http://portal.acm.org/citation.cfm?id=1138000}, doi = {10.1145/1137983.1138000}, abstract = {Similarity analysis of source code is helpful during development to provide, for instance, better support for code reuse. Consider a development environment that analyzes code while typing and that suggests similar code examples or existing implementations from a source code repository. 
Mining software repositories by means of similarity measures enables and enforces reusing existing code and reduces the developing effort needed by creating a shared knowledge base of code fragments. In information retrieval similarity measures are often used to find documents similar to a given query document. This paper extends this idea to source code repositories. It introduces our approach to detect similar Java classes in software projects using tree similarity algorithms. We show how our approach allows to find similar Java classes based on an evaluation of three tree-based similarity measures in the context of five user-defined test cases as well as a preliminary software evolution analysis of a medium-sized Java project. Initial results of our technique indicate that it (1) is indeed useful to identify similar Java classes, (2)successfully identifies the ex ante and ex post versions of refactored classes, and (3) provides some interesting insights into within-version and between-version dependencies of classes within a Java project.}, booktitle = {Proceedings of the 2006 international workshop on Mining software repositories}, publisher = {{ACM}}, author = {Sager, Tobias and Bernstein, Abraham and Pinzger, Martin and Kiefer, Christoph}, year = {2006}, keywords = {change analysis, reusability, similarity, software repositories}, pages = {65--71} }, @article{deerwester_indexing_1990, title = {Indexing by latent semantic analysis}, volume = {41}, abstract = {A new method for automatic indexing and retrieval is described. The approach is to take advantage of implicit higher-order structure in the association of terms with documents ("semantic structure") in order to improve the detection of relevant documents on the basis of terms found in queries. 
The particular technique used is singular-value decomposition, in which a large term by document matrix is decomposed into a set of ca 100 orthogonal factors from which the original matrix can be approximated by linear combination. Documents are represented by ca 100 item vectors of factor weights. Queries are represented as pseudo-document vectors formed from weighted combinations of terms, and documents with supra-threshold cosine values are returned. Initial tests find this completely automatic method for retrieval to be promising.}, number = {6}, journal = {Journal of the American Society for Information Science}, author = {Deerwester, S. and Dumais, S. T. and Furnas, G. W. and Landauer, T. K. and Harshman, R.}, year = {1990}, pages = {391--407} }, @article{steimann_type_2007, title = {Type Access Analysis: Towards Informed Interface Design}, volume = {6}, shorttitle = {Type Access Analysis}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.86.435}, abstract = {Programs designed from scratch often start with just a set of classes. Classes can be instantiated and so deliver the objects that are the carriers of information and function. In languages like Java and C++, classes also define types, so that they are sufficient to write a fully functional, type-checked program. Abstract classes and interfaces, which cannot be used for object creation, but which can serve to structure and decouple the code, are then either added later (as a result of refactoring) or never. One impediment to designing and introducing such type abstractions (generalizations) retroactively is that it is unclear how they can be used in a program, or what they should contain in order to be usable. However, this knowledge is, so we argue, completely contained in the program {\textemdash} it only needs to be unveiled. 
With our Type Access Analyzer {(TAA)} tool, we collect information useful for the design of type abstractions (abstract classes and interfaces) and their use, and present it to the developer for performing type-related refactorings in an informed manner.}, number = {9}, journal = {Journal of Object Technology}, author = {Steimann, Friedrich and Mayer, Philip}, month = oct, year = {2007}, keywords = {interfaces}, pages = {147--164} }, @article{lanza_categorization_2001, title = {A categorization of classes based on the visualization of their internal structure: the class blueprint}, volume = {36}, shorttitle = {A categorization of classes based on the visualization of their internal structure}, url = {http://portal.acm.org/citation.cfm?id=504311.504304}, doi = {10.1145/504311.504304}, abstract = {The reengineering and reverse engineering of software systems is gaining importance in software industry, because the accelerated turnover in software companies creates legacy systems in a shorter period of time. Especially understanding classes is a key activity in object-oriented programming, since classes represent the primary abstractions from which applications are built. The main problem of this task is to quickly grasp the purpose of a class and its inner structure. To help the reverse engineers in their first contact with a foreign system, we propose a categorization of classes based on the visualization of their internal structure. The contributions of this paper are a novel categorization of classes and a visualization of the classes which we call the class blueprint. 
We have validated the categorization on several case studies, two of which we present here.}, number = {11}, journal = {{SIGPLAN} Notices}, author = {Lanza, Michele and Ducasse, St{\'e}phane}, year = {2001}, keywords = {reverse engineering, visualization}, pages = {300--311} }, @inproceedings{evans_clone_2007, title = {Clone Detection via Structural Abstraction}, issn = {1095-1350}, abstract = {This paper describes the design, implementation, and application of a new algorithm to detect cloned code. It operates on the abstract syntax trees formed by many compilers as an intermediate representation. It extends prior work by identifying clones even when arbitrary subtrees have been changed. On a 440,000-line code corpus, 20-50\% of the clones it detected were missed by previous methods. The method also identifies cloning in declarations, so it is somewhat more general than conventional procedural abstraction.}, booktitle = {Reverse Engineering, 2007. {WCRE} 2007. 14th Working Conference on}, author = {Evans, W. S. and Fraser, C. W. and Ma, Fei}, year = {2007}, keywords = {abstract syntax trees, clones}, pages = {150--159} }, @inproceedings{maentylae_bad_2004, address = {Los Alamitos, {CA}, {USA}}, title = {Bad Smells - Humans as Code Critics}, doi = {10.1109/ICSM.2004.1357825}, abstract = {This paper presents the results of an initial empirical study on the subjective evaluation of bad code smells, which identify poor structures in software. Based on a case study in a Finnish software product company, we make two contributions. First, we studied the evaluator effect when subjectively evaluating the existence of smells in code modules. We found that the use of smells for code evaluation purposes is hard due to conflicting perceptions of different evaluators. Second, we applied source code metrics for identifying three smells and compared these results to the subjective evaluations. 
Surprisingly, the metrics and smell evaluations did not correlate.}, booktitle = {Software Maintenance, {IEEE} International Conference on}, publisher = {{IEEE} Computer Society}, author = {M\"{a}ntyl\"{a}, Mika V. and Vanhanen, Jari and Lassenius, Casper}, year = {2004}, pages = {399--408}, annote = {Complete {PDF} document was either not available or accessible. Please make sure you're logged in to the digital library to retrieve the complete {PDF} document.} }, @article{kay_early_1993, title = {The early history of Smalltalk}, volume = {28}, url = {http://portal.acm.org/citation.cfm?id=155364}, doi = {10.1145/155360.155364}, abstract = {Most ideas come from previous ideas. The sixties, particularly in the {ARPA} community, gave rise to a host of notions about {\textquotedblleft}human-computer symbiosis{\textquotedblright} through interactive time-shared computers, graphics screens and pointing devices. Advanced computer languages were invented to simulate complex systems such as oil refineries and semi-intelligent behavior. The soon to follow paradigm shift of modern personal computing, overlapping window interfaces, and object-oriented design came from seeing the work of the sixties as something more than a {\textquotedblleft}better old thing{\textquotedblright}. That is, more than a better way: to do mainframe computing; for end-users to invoke functionality; to make data structures more abstract. Instead the promise of exponential growth in computing/\$/volume demanded that the sixties be regarded as {\textquotedblleft}almost a new thing{\textquotedblright} and to find out what the actual {\textquotedblleft}new things{\textquotedblright} might be. 
For example, one would compute with a handheld {{\textquotedblleft}Dynabook{\textquotedblright}} in a way that would not be possible on a shared mainframe; millions of potential users meant that the user interface would have to become a learning environment along the lines of Montessori and Bruner; and needs for large scope, reduction in complexity, and end-user literacy would require that data and control structures be done away with in favor of a more biological scheme of protected universal cells interacting only through messages that could mimic any desired behavior. Early Smalltalk was the first complete realization of these new points of view as parented by its many predecessors in hardware, language and user interface design. It became the exemplar of the new computing, in part, because we were actually trying for a qualitative shift in belief structures{\textemdash}a new Kuhnian paradigm in the same spirit as the invention of the printing press{\textemdash}and thus took highly extreme positions which almost forced these new styles to be invented.}, number = {3}, journal = {{SIGPLAN} Notices}, author = {Kay, Alan C.}, year = {1993}, keywords = {{OOP}}, pages = {69--95}, annote = {This paper talks about more than just Smalltalk. It is an interesting history of {OO} and the technological breakthroughs of the 60s and 70s. Relevance: 3} }, @article{bowman_solving_2010, title = {Solving the Class Responsibility Assignment Problem in Object-oriented Analysis with {Multi-Objective} Genetic Algorithms}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.164.571}, abstract = {In the context of object-oriented analysis and design {(OOAD)}, class responsibility assignment is not an easy skill to acquire. Though there are many methodologies for assigning responsibilities to classes, they all rely on human judgment and decision making. Our objective is to provide decision-making support to re-assign methods and attributes to classes in a class diagram. 
Our solution is based on a multi-objective genetic algorithm {(MOGA)} and uses class coupling and cohesion measurement for defining fitness functions. Our {MOGA} takes as input a class diagram to be optimized and suggests possible improvements to it. The choice of a {MOGA} stems from the fact that there are typically many evaluation criteria that cannot be easily combined into one objective, and several alternative solutions are acceptable for a given {OO} domain model. Using a carefully selected case study, this article investigates the application of our proposed {MOGA} to the class responsibility assignment problem, in the context of object-oriented analysis and domain class models. Our results suggest that the {MOGA} can help correct suboptimal class responsibility assignment decisions and perform far better than simpler alternative heuristics such as hill climbing and a single objective {GA.}}, number = {99}, journal = {{IEEE} Transactions on Software Engineering}, author = {Bowman, Michael and Briand, Lionel C and Labiche, Yvan}, month = jul, year = {2010}, keywords = {evolutionary algorithms, genetic algorithms, refactoring} }, @techreport{el-emam_object-oriented_2001, title = {{Object-Oriented} Metrics: A Review of Theory and Practice}, url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.92.4137&rep=rep1&type=pdf}, institution = {National Research Council Canada}, author = {{El-Emam}, K.}, year = {2001}, keywords = {empirical, maintainability, metrics} }, @inproceedings{jiang_framework_2007, title = {A Framework for Studying Clones In Large Software Systems}, isbn = {0-7695-2880-5}, url = {http://portal.acm.org/citation.cfm?id=1307349}, abstract = {Clones are code segments that have been created by copying-and-pasting from other code segments. Clones occur often in large software systems. It is reported that 5 to 50\% of the source code of a large software system is cloned. 
A major challenge when studying code cloning in large software systems is handling the large amount of clone candidates produced by leading edge clone detection tools. For example, the {CCFinder}, clone detection tool, produces over 7 million pairs of clone candidates for the Linux kernel (which consists of over 4 {MLOC).} Moreover, the output of clone detection tools grows rapidly as a software system evolves. Researchers and developers need tools to help them study the large amount of clone data in order to better understand the clone phenomena in large systems. In this paper, we propose a data mining framework to help researchers cope with the large amount of data produced by clone detection tools. We propose techniques to reduce, abstract and highlight the most interesting data produced by clone detection tools. Our framework also introduces a visualization tool which allows users to query and explore clone data at various abstraction levels. We demonstrate our framework on a case study of the clone phenomena in the Linux kernel.}, booktitle = {Proceedings of the Seventh {IEEE} International Working Conference on Source Code Analysis and Manipulation}, publisher = {{IEEE} Computer Society}, author = {Jiang, Zhen Ming and Hassan, Ahmed E.}, year = {2007}, keywords = {clone detection, clones, visualization}, pages = {203--212}, annote = {Surprisingly uninteresting. Talks about ways of filtering and visualizing results of a large system containing clones. Relevance: 2} }, @article{savi_characteristics_2011, title = {Characteristics of Class Collaboration Networks in Large Java Software Projects}, volume = {40}, issn = {{1392-124X}}, abstract = {Understanding software structural complexity and evolution plays an important role in controlling the software development and maintenance process. 
Recent studies have shown that the theory behind complex networks, especially the theory of scale-free networks, can be a useful approach to the analysis of concrete software systems. In this paper, class collaboration networks associated with five large Java software systems ({JDK}, Ant, Tomcat, Lucene and {JavaCC}) are analyzed in order to determine whether they belong to the class of scale-free networks, and examine their small-world characteristics. For each analyzed network, we detected (approximately) scale-free and (ultra) small-world properties. The results indicate that general conclusions from scale-free network theory can be applied to Java software systems in order to understand their structural complexity and model software evolution at the structural (class collaboration) level. Moreover, we examined class collaboration network evolution of Ant, in order to check the preferential attachment hypothesis of the {Barab{\'a}si-Albert} model. For several major Ant network transitions we concluded that preferential attachment can successfully model Ant evolution at the class collaboration level. Finally, we discuss the implications of our results on software engineering, in several aspects: identification of important classes/interfaces, software testing strategy, and efficient communication among software entities.}, number = {1}, journal = {Information Technology and Control}, author = {Savi{\'c}, M. and Ivanovi{\'c}, M. and Radovanovi{\'c}, M.}, year = {2011}, pages = {48--58} }, @article{cox_cohesion_2006, title = {Cohesion metric for object-oriented systems based on semantic closeness from disambiguity}, volume = {20}, doi = {10.1080/08839510500313687}, abstract = {Object-oriented semantic metrics address software quality by assessing underlying code meaning. Previous metrics were based on mapping a class{\textquoteright}s semantic information onto concepts in an application domain knowledge base. 
Quality measurements were made by operating on the concepts mapped onto. In this work, we consider more complex inter-concept relationships{\textemdash}semantic disambiguities through semantic connections. The idea is that a level of ambiguity is indicated by the connectivity within the knowledge base between two concepts. A cohesion metric based on this idea is shown to perform as well as traditional metrics, and is available much earlier in the development cycle.}, number = {5}, journal = {Applied Artificial Intelligence}, author = {Cox, Glenn W. and Etzkorn, Letha H. and Hughes, Jr, William E.}, year = {2006}, keywords = {cohesion, kbs, metrics, ontologies, semantic networks, semantics}, pages = {419--436}, annote = {Discusses a semantics-based cohesion metric that depends on a domain-specific knowledge-base. Relevance: 4} }, @inproceedings{allen_measuring_2001, title = {Measuring coupling and cohesion of software modules: an information-theory approach}, isbn = {0-7695-1043-4}, shorttitle = {Measuring Coupling and Cohesion of Software Modules}, url = {http://portal.acm.org/citation.cfm?id=823994}, abstract = {Coupling of a subsystem characterizes its interdependence with other subsystems. A subsystem's cohesion, on the other hand, characterizes its internal interdependencies. When used in conjunction with other attributes, measurements of a subsystem's coupling and cohesion can contribute to software quality models. An abstraction of a software system can be represented by a graph and a module (subsystem) by a subgraph. Software-design graphs depict components and their relationships. Prior work by Allen and Khoshgoftaar proposed information theory-based measures of coupling and cohesion of a modular system. This paper proposes related information theory-based measures of coupling and cohesion of a module. These measures have the properties of module-level coupling and cohesion defined by Briand, Morasca, and Basili. 
We define cohesion of a module in terms of intramodule coupling, normalized to between zero and one. We illustrate the measures with example graphs and an empirical analysis of the call graph of a moderate-size C program, the Nethack computer game. Preliminary analysis showed that the information-theory approach has finer discrimination than counting.}, booktitle = {Proceedings of the 7th International Symposium on Software Metrics}, publisher = {{IEEE} Computer Society}, author = {Allen, Edward B. and Khoshgoftaar, Taghi M. and Chen, Ye}, year = {2001}, keywords = {call graph, cohesion, coupling, entropy, information theory, metrics}, pages = {124} }, @article{koschke_software_2003, title = {Software visualization in software maintenance, reverse engineering, and re-engineering: a research survey}, volume = {15}, issn = {{1532-060X}}, shorttitle = {Software visualization in software maintenance, reverse engineering, and re-engineering}, url = {http://portal.acm.org/beta/citation.cfm?id=859271.859274}, doi = {10.1002/smr.270}, number = {2}, journal = {Journal of Software Maintenance and Evolution: Research and Practice}, author = {Koschke, Rainer}, month = mar, year = {2003}, keywords = {maintenance, reengineering, reverse engineering, survey, visualization}, pages = {87--109} }, @misc{gamma_jhotdraw_2007, title = {{JHotDraw} Start Page}, url = {http://www.jhotdraw.org/}, author = {Gamma, Erich and Eggenschwiler, Thomas}, year = {2007}, note = {Accessed 2011-10-03}, howpublished = {http://www.jhotdraw.org/} }, @article{dietrich_towards_2007, title = {Towards a web of patterns}, volume = {5}, url = {http://portal.acm.org/citation.cfm?id=1265749}, abstract = {Design patterns have been used successfully in recent years in the software engineering community in order to share knowledge about the structural and behavioural properties of software. 
There is a growing body of research in the area of design pattern detection and design recovery, requiring a formal description of patterns which can be matched by tools against the software that is to be analysed. We propose a novel approach to the formal definition of design patterns that is based on the idea that design patterns are knowledge that is shared across a community and that is by nature distributed and inconsistent. By using the web ontology language {(OWL)} we are able to formally define design patterns and some related concepts such as pattern participant, pattern refinement, and pattern instance. We discuss the respective ontology and give examples of how patterns can be defined using this ontology. We present the prototype of a Java client that accesses the pattern definitions and detects patterns in Java software, and analyse some scan results. This leads to the discussion on design pattern instantiation.}, number = {2}, journal = {Journal of Web Semantics}, author = {Dietrich, Jens and Elgar, Chris}, year = {2007}, keywords = {design patterns, semantic web}, pages = {108--116} }, @article{kuhn_semantic_2007, title = {Semantic clustering: Identifying topics in source code}, volume = {49}, shorttitle = {Semantic clustering}, url = {http://portal.acm.org/citation.cfm?id=1224560.1224698}, abstract = {Many of the existing approaches in Software Comprehension focus on program structure or external documentation. However, by analyzing formal information the informal semantics contained in the vocabulary of source code are overlooked. To understand software as a whole, we need to enrich software analysis with the developer knowledge hidden in the code naming. This paper proposes the use of information retrieval to exploit linguistic information found in source code, such as identifier names and comments. We introduce Semantic Clustering, a technique based on Latent Semantic Indexing and clustering to group source artifacts that use similar vocabulary. 
We call these groups semantic clusters and we interpret them as linguistic topics that reveal the intention of the code. We compare the topics to each other, identify links between them, provide automatically retrieved labels, and use a visualization to illustrate how they are distributed over the system. Our approach is language independent as it works at the level of identifier names. To validate our approach we applied it on several case studies, two of which we present in this paper. Note: Some of the visualizations presented make heavy use of colors. Please obtain a color copy of the article for better understanding.}, number = {3}, journal = {Information and Software Technology}, author = {Kuhn, Adrian and Ducasse, St{\'e}phane and G{\^\i}rba, Tudor}, year = {2007}, keywords = {clustering, latent semantic indexing, reverse engineering, software clustering}, pages = {230--243} }, @inproceedings{dagpinar_predicting_2003, title = {Predicting maintainability with object-oriented metrics - an empirical comparison}, isbn = {0-7695-2027-8}, url = {http://portal.acm.org/citation.cfm?id=951372}, abstract = {A large number of metrics have been proposed for measuring properties of object-oriented software such as size, inheritance, cohesion and coupling. We have been investigating which of these object-oriented metrics can be used as significant predictors for the maintainability of software. For this purpose, we have designed and conducted an empirical study based on historical data collected from the maintenance history of a medium-sized object-oriented system. Unlike most related studies, indirect coupling has also been taken into account in our work in order to evaluate its impact. Our study uses the maintenance history of two software systems as evidence base for linking software quality attributes to metrics suggested for object-oriented software. 
Our results indicate that size and import direct coupling metrics are significant predictors for measuring maintainability of classes while inheritance, cohesion, and indirect/export coupling measures are not.}, booktitle = {Proceedings of the 10th Working Conference on Reverse Engineering}, publisher = {{IEEE} Computer Society}, author = {Dagpinar, Melis and Jahnke, Jens H.}, year = {2003}, keywords = {cohesion, coupling, empirical, maintainability, maintenance, metrics, size}, pages = {155}, annote = {Found no correlation between maintainability and cohesion {(LCC).} } }, @article{gansner_open_2000, title = {An open graph visualization system and its applications to software engineering}, volume = {30}, url = {http://dx.doi.org/10.1002/1097-024X(200009)30:11<1203::AID-SPE338>3.0.CO;2-N}, doi = {10.1002/1097-024X(200009)30:11<1203::AID-SPE338>3.0.CO;2-N}, abstract = {We describe a package of practical tools and libraries for manipulating graphs and their drawings. Our design, which is aimed at facilitating the combination of the package components with other tools, includes stream and event interfaces for graph operations, high-quality static and dynamic layout algorithms, and the ability to handle sizeable graphs. We conclude with a description of the applications of this package to a variety of software engineering tools. Copyright {\textcopyright} 2000 John Wiley \& Sons, Ltd.}, number = {11}, journal = {Software: Practice and Experience}, author = {Gansner, Emden R. 
and North, Stephen C.}, year = {2000}, keywords = {graphs, visualization}, pages = {1203--1233} }, @article{priss_formal_2006, title = {Formal concept analysis in information science}, volume = {40}, url = {http://dx.doi.org/10.1002/aris.1440400120}, doi = {10.1002/aris.1440400120}, number = {1}, journal = {Annual Review of Information Science and Technology}, author = {Priss, Uta}, year = {2006}, keywords = {{FCA}}, pages = {521--543} }, @inproceedings{moore_automatic_1996, address = {San Jose, California, United States}, title = {Automatic inheritance hierarchy restructuring and method refactoring}, isbn = {{0-89791-788-X}}, url = {http://portal.acm.org/citation.cfm?id=236361}, doi = {10.1145/236337.236361}, abstract = {Most, object-oriented programs have imperfectly designed inheritance hierarchies and imperfectly factored methods, and these imperfections tend to increase with maintenance. Hence, even object-oriented programs are more expensive to maintain, harder to understand and larger than necessary. Automatic restructuring of inheritance hierarchies and refactoring of methods can improve the design of inheritance hierarchies, and the factoring of methods. This results in programs being smaller, having better code re-use and being more consistent. This paper describes Guru, a prototype tool for automatic inheritance hierarchy restructuring and method refactoring of Self programs. Results from realistic applications of the tool are presented.}, booktitle = {Proceedings of the 11th {ACM} {SIGPLAN} conference on Object-oriented programming, systems, languages, and applications}, publisher = {{ACM}}, author = {Moore, Ivan}, year = {1996}, keywords = {inheritance, refactoring, restructuring}, pages = {235--250}, annote = {Self doesn't have static classes, so it's not clear how this applies to {OO} languages like C++ and Java. The approach is to flatten the inheritance hierarchy and then rebuild superclasses to optimally distribute reused methods. 
The authors point out that Guru will make the optimal allocation based on how the system currently exists, not on how it should have existed.} }, @article{allen_measuring_2007, title = {Measuring size, complexity, and coupling of hypergraph abstractions of software: An information-theory approach}, volume = {15}, shorttitle = {Measuring size, complexity, and coupling of hypergraph abstractions of software}, url = {http://portal.acm.org/citation.cfm?id=1232687&dl=GUIDE&coll=GUIDE&CFID=36587312&CFTOKEN=77502231}, abstract = {Software development is fundamentally based on cognitive processes. Our motivating hypothesis is that amounts of various kinds of information in software artifacts may have useful statistical relationships with software-engineering attributes. This paper proposes measures of size, complexity and coupling in terms of the amount of information, building on formal definitions of these software-metric families proposed by Briand, Morasca, and Basili.}, number = {2}, journal = {Software Quality Control}, author = {Allen, Edward B. and Gottipati, Sampath and Govindarajan, Rajiv}, year = {2007}, keywords = {complexity, coupling, entropy, information theory, measurement theory, metrics, size}, pages = {179--212} }, @article{al_dallal_design-based_2007, title = {A design-based cohesion metric for object-oriented classes}, volume = {1}, abstract = {Class cohesion is an important object-oriented software quality attribute. It indicates how much the members in a class are related. Assessing the class cohesion and improving the class quality accordingly during the object-oriented design phase allows for cheaper management of the later phases. In this paper, the notion of distance between pairs of methods and pairs of attribute types in a class is introduced and used as a basis for introducing a novel class cohesion metric. The metric considers the methodmethod, attribute-attribute, and attribute-method direct interactions. 
It is shown that the metric gives more sensitive values than other well-known design-based class cohesion metrics.}, number = {3}, journal = {International Journal of Computer Science and Engineering}, author = {Al Dallal, J.}, year = {2007}, keywords = {cohesion, distance metric, metrics, similarity}, pages = {195--200} }, @article{bansiya_hierarchical_2002, title = {A hierarchical model for object-oriented design quality assessment}, volume = {28}, abstract = {This paper describes an improved hierarchical model for the assessment of high-level design quality attributes in object-oriented designs. In this model, structural and behavioral design properties of classes, objects, and their relationships are evaluated using a suite of object-oriented design metrics. This model relates design properties such as encapsulation, modularity, coupling, and cohesion to high-level quality attributes such as reusability, flexibility, and complexity using empirical and anecdotal information. The relationship, or links, from design properties to quality attributes are weighted in accordance with their influence and importance. The model is validated by using empirical and expert opinion to compare with the model results on several large commercial object-oriented systems. A key attribute of the model is that it can be easily modified to include different relationships and weights, thus providing a practical quality assessment tool adaptable to a variety of demands.}, number = {1}, journal = {{IEEE} Transactions on Software Engineering}, author = {Bansiya, Jagdish and Davis, Carl}, year = {2002}, keywords = {empirical, maintainability, metrics, metrics validation}, pages = {4--17} }, @inproceedings{serban_restructuring_2007, address = {Ankara, Turkey}, title = {Restructuring software systems using clustering}, abstract = {In this paper we are focusing on the problem of restructuring object oriented software systems using clustering techniques. 
Refactoring ([1]) is one major issue to improve the design of software systems, increasing the internal software quality. This paper aims at introducing a new k-medoids based clustering algorithm that can be used for improving the design of software systems, by identifying the needed refactorings. The algorithm uses a measure that evaluates a software system design. Clustering ([2]) is used in order to recondition the class structure of a software system. The proposed approach can be useful for assisting software engineers in their daily works of refactoring software systems. We evaluate our approach using the open source case study {JHotDraw} ([3]), illustrating the advantages of our approach in comparison with existing approaches.}, booktitle = {Proceedings of The 22nd International Symposium on Computer and Information Sciences}, author = {Serban, G. and Czibula, {I.-G.}}, month = nov, year = {2007}, keywords = {clustering, graphs, maintenance, {OOP}, refactoring, software clustering}, pages = {1--6}, annote = {This discusses a variation of k-medoids to cluster attributes, methods, and classes based on cohesion. Interestingly, each object of these three entity types is treated similarly in that they have "property sets" that include objects of the other two entity types as well as themselves. Following the clustering run, refactorings are proposed to convert the original system into the proposed one. 
Relevance: 5} }, @inproceedings{dietrich_formal_2005, address = {Washington, {DC}, {USA}}, title = {A Formal Description of Design Patterns Using {OWL}}, isbn = {0-7695-2257-2}, doi = {10.1109/ASWEC.2005.6}, booktitle = {{ASWEC} '05: Proceedings of the 2005 Australian conference on Software Engineering}, publisher = {{IEEE} Computer Society}, author = {Dietrich, Jens and Elgar, Chris}, year = {2005}, keywords = {design patterns, ontologies}, pages = {243--250}, annote = {Discusses a formal description of design patterns using the web ontology language {OWL.} An effective pattern scanner for the java language is based on the ontology. It uses reflection and {AST} analysis to verify constraints. Various applications of this scanner are discussed. Relevance: 5 } }, @inproceedings{xu_program_2004, title = {Program Restructuring Through Clustering Techniques}, isbn = {0-7695-2144-4}, abstract = {Program restructuring is a key method for improving the quality of ill-structured programs, thereby increasing the understandability and reducing the maintenance cost. It is a challenging task and a great deal of research is still ongoing. This paper presents an approach to program restructuring at the function level, based on clustering techniques with cohesion as the major concern. Clustering has been widely used to group related entities together. The approach focuses on automated support for identifying ill-structured or low-cohesive functions and providing heuristic advice in both the development and evolution phases. A new similarity measure is defined and studied intensively. The approach is applied to restructure a real industrial program. The empirical observations show that the heuristic advice provided by the approach can help software designers make better decision of why and how to restructure a program. 
Specific source code level software metrics are presented to demonstrate the value of the approach.}, booktitle = {Proceedings of the Source Code Analysis and Manipulation, Fourth {IEEE} International Workshop}, publisher = {{IEEE} Computer Society}, author = {Xu, Xia and Lung, {Chung-Horng} and Zaman, Marzia and Srinivasan, Anand}, year = {2004}, keywords = {clustering, refactoring, software clustering}, pages = {75--84}, url = {http://portal.acm.org/citation.cfm?id=1022147} }, @inproceedings{wu_comparison_2005, title = {Comparison of Clustering Algorithms in the Context of Software Evolution}, isbn = {0-7695-2368-4}, url = {http://portal.acm.org/citation.cfm?id=1091881}, abstract = {To aid software analysis and maintenance tasks, a number of software clustering algorithms have been proposed to automatically partition a software system into meaningful subsystems or clusters. However, it is unknown whether these algorithms produce similar meaningful clusterings for similar versions of a real-life software system under continual change and growth. This paper describes a comparative study of six software clustering algorithms. We applied each of the algorithms to subsequent versions from five large open source systems. We conducted comparisons based on three criteria respectively: stability {(Does} the clustering change only modestly as the system undergoes modest updating?), authoritativeness {(Does} the clustering reasonably approximate the structure an authority provides?) and extremity of cluster distribution {(Does} the clustering avoid huge clusters and many very small clusters?). Experimental results indicate that the studied algorithms exhibit distinct characteristics. For example, the clusterings from the most stable algorithm bear little similarity to the implemented system structure, while the clusterings from the least stable algorithm has the best cluster distribution. 
Based on obtained results, we claim that current automatic clustering algorithms need significant improvement to provide continual support for large software projects.}, booktitle = {Proceedings of the 21st {IEEE} International Conference on Software Maintenance}, publisher = {{IEEE} Computer Society}, author = {Wu, Jingwei and Hassan, Ahmed E. and Holt, Richard C.}, year = {2005}, keywords = {clustering, software clustering, subsystem identification, survey}, pages = {525--535}, annote = {In summary, the 6 algorithms studied are not much good for identifying subsystems. Relevance: 3} }, @article{newman_scientific_2001, title = {Scientific collaboration networks. {II.} Shortest paths, weighted networks, and centrality.}, volume = {64}, issn = {1539-3755}, url = {http://view.ncbi.nlm.nih.gov/pubmed/11461356}, abstract = {Using computer databases of scientific papers in physics, biomedical research, and computer science, we have constructed networks of collaboration between scientists in each of these disciplines. In these networks two scientists are considered connected if they have coauthored one or more papers together. Here we study a variety of nonlocal statistics for these networks, such as typical distances between scientists through the network, and measures of centrality such as closeness and betweenness. 
We further argue that simple networks such as these cannot capture variation in the strength of collaborative ties and propose a measure of collaboration strength based on the number of papers coauthored by pairs of scientists, and the number of other scientists with whom they coauthored those papers.}, number = {1 Pt 2}, journal = {Phys Rev E Stat Nonlin Soft Matter Phys}, author = {Newman, {ME}}, month = jul, year = {2001}, keywords = {collaboration, graph algorithms, graphs, {SNA}} }, @book{henderson-sellers_object-oriented_1996, title = {Object-oriented metrics: measures of complexity}, isbn = {0-13-239872-9}, lccn = {{QA76.64} H4974 O}, shorttitle = {Object-oriented metrics}, url = {http://portal.acm.org/citation.cfm?id=229953}, publisher = {{Prentice-Hall}, Inc.}, author = {{Henderson-Sellers}, Brian}, year = {1996}, keywords = {cohesion, graphs, metrics}, annote = {This is dated as a reference. Nonetheless, it gives a good high-level explanation of the purpose of {OO} measures/metrics and analysis of strengths and weaknesses within them. Another of its strengths is its recognizance of semantics. It mentions cohesion in the context of a car\_person example. It also utilizes the idea of chunking. {LCOM*} defined on page 147. Relevance:4} }, @article{rui_xu_survey_2005, title = {Survey of clustering algorithms}, volume = {16}, issn = {1045-9227}, doi = {10.1109/TNN.2005.845141}, abstract = {Data analysis plays an indispensable role for understanding various phenomena. Cluster analysis, primitive exploration with little or no prior knowledge, consists of research developed across a wide variety of communities. The diversity, on one hand, equips us with many tools. On the other hand, the profusion of options causes confusion. 
We survey clustering algorithms for data sets appearing in statistics, computer science, and machine learning, and illustrate their applications in some benchmark data sets, the traveling salesman problem, and bioinformatics, a new field attracting intensive efforts. Several tightly related topics, proximity measure, and cluster validation, are also discussed.}, number = {3}, journal = {{IEEE} Transactions on Neural Networks}, author = {Rui Xu and Wunsch, D.}, month = may, year = {2005}, keywords = {Adaptive resonance theory {(ART)}, algorithms, Application software, benchmark data sets, Bioinformatics, cluster analysis, cluster validation, clustering, clustering algorithm, Clustering algorithms, Computer science, Computer Simulation, Data analysis, Humans, machine learning, Machine learning algorithms, Models, Statistical, neural networks, Neural Networks {(Computer)}, Numerical Analysis, {Computer-Assisted}, pattern classification, pattern clustering, Pattern Recognition, Automated, proximity, self-organizing feature map {(SOFM)}, Signal Processing, {Computer-Assisted}, Statistics, Stochastic Processes, traveling salesman problem, Traveling salesman problems}, pages = {645--678} }, @inproceedings{trifu_diagnosing_2005, address = {Pittsburgh, Pennsylvania}, title = {Diagnosing Design Problems in Object Oriented Systems}, isbn = {0-7695-2474-5}, url = {http://www2.computer.org/portal/web/csdl/doi/10.1109/WCRE.2005.15}, abstract = {Software decay is a phenomenon that plagues aging software systems. While in recent years, there has been significant progress in the area of automatic detection of {\textquotedblleft}code smells{\textquotedblright} on one hand, and code refactorings on the other hand, we claim that existing restructuring practices are seriously hampered by their symptomatic and informal (nonrepeatable) nature. 
This paper makes a clear distinction between structural problems and structural symptoms (also known as code smells), and presents a novel, causal approach to restructuring object oriented systems. Our approach is based on two innovations: the encapsulation of correlations of symptoms and additional contextual information into higher-level design problems, and the univocal, explicit mapping of problems to unique refactoring solutions. Due to its explicit, repeatable nature, the approach shows high potential for increased levels of automation in the restructuring process, and consequently a decrease in maintenance costs.}, booktitle = {Proceedings of the 12th Working Conference on Reverse Engineering {(WCRE} 2005)}, author = {Trifu, Adrian and Marinescu, Radu}, month = nov, year = {2005}, keywords = {refactoring, smells}, pages = {155--164}, annote = {This is a much shorter precursor to {"Object} Oriented Metrics in Practice". The book is the better reference.} }, @inproceedings{badri_towards_1995, address = {Versailles, France}, title = {Towards quality control metrics for object-oriented systems analysis}, booktitle = {{TOOLS} {(Technology} of {Object-Oriented} Languages and Systems)}, publisher = {{Prentice-Hall}}, author = {Badri, Linda and Badri, Mourad and Ferdenache, S.}, month = mar, year = {1995}, keywords = {cohesion} }, @inproceedings{henninger_ontology-based_2006, address = {San Francisco, {CA}, {USA}}, title = {An {Ontology-Based} Metamodel for Software Patterns}, isbn = {1-891706-18-7}, abstract = {Patterns have been successfully used in software design to reuse proven solutions. But the complex interconnections and the number of pattern collections is becoming a barrier for identifying relevant patterns and pattern combinations for a given design context. More formal representations of patterns are needed that allow machine processing and the creation of systematic pattern languages that guide composition of patterns into coherent design solutions. 
In this paper, we present a technique based on Description Logic and Semantic Web technologies to address these problems. A metamodel is presented for developing pattern languages using this technology. Usability patterns are used to demonstrate how this metamodel can be instantiated to form a pattern language for that domain. Our technique provides a computational basis for building intelligent tools that utilize patterns to support software development activities.}, booktitle = {Proceedings of the Eighteenth International Conference on Software Engineering \& Knowledge Engineering {(SEKE'2006)}}, author = {Henninger, Scott and Ashokkumar, Padmapriya}, month = jul, year = {2006}, keywords = {kbs, ontologies, {OWL}, patterns}, pages = {327--330} }, @article{farnstrom_scalability_2000, title = {Scalability for clustering algorithms revisited}, volume = {2}, issn = {19310145}, url = {http://dl.acm.org/citation.cfm?id=360419}, doi = {10.1145/360402.360419}, abstract = {This paper presents a simple new algorithm that performs k-means clustering in one scan of a dataset, while using a buffer for points from the dataset of fixed size. Experiments show that the new method is several times faster than standard k-means, and that it produces clusterings of equal or almost equal quality. The new method is a simplification of an algorithm due to Bradley, Fayyad and Reina that uses several data compression techniques in an attempt to improve speed and clustering quality. Unfortunately, the overhead of these techniques makes the original algorithm several times slower than standard k-means on materialized datasets, even though standard k-means scans a dataset multiple times. Also, lesion studies show that the compression techniques do not improve clustering quality. All results hold for 400 megabyte synthetic datasets and for a dataset created from the real-world data used in the 1998 {KDD} data mining contest. 
All algorithm implementations and experiments are designed so that results generalize to datasets of many gigabytes and larger.}, number = {1}, journal = {{ACM} {SIGKDD} Explorations Newsletter}, author = {Farnstrom, Fredrik and Lewis, James and Elkan, Charles}, month = jun, year = {2000}, keywords = {clustering, performance}, pages = {51--57} }, @article{angioni_integrating_2006, title = {Integrating {XP} project management in development environments}, volume = {52}, issn = {1383-7621}, url = {http://www.sciencedirect.com/science/article/B6V1F-4KKWW33-1/2/91c015e0f812b043d07463f2ab1058cd}, doi = {10.1016/j.sysarc.2006.06.006}, abstract = {Extreme Programming {(XP)} is an Agile Methodology {(AM)} which does not require any specific supporting tool for being successfully applied. Despite this starting observation, there are many reasons leading a {XP} team to adopt Web based tools to support {XP} practices. For example, such tools could be useful for process and product data collection and analysis or for supporting distributed development. In this article, we describe {XPSuite}, a tool composed of two parts: {XPSwiki}, a tool for managing {XP} projects and {XP4IDE}, a plug-in for integrating {XPSwiki} with an Integrated Development Environment {(IDE).} Moreover, we will show how the full Object Oriented implementation provides a powerful support for extracting all data represented in the model that the system implements.}, number = {11}, journal = {Journal of Systems Architecture}, author = {Angioni, M. and Carboni, D. and Pinna, S. and Sanna, R. and Serra, N. 
and Soro, A.}, month = nov, year = {2006}, keywords = {agile, Distributed development, {XP}}, pages = {619--626}, annote = {Uses "refactor mercilessly".} }, @incollection{brandes_experiments_2003, title = {Experiments on Graph Clustering Algorithms}, url = {http://www.springerlink.com/content/cv8tbjmrhdflau3r}, abstract = {A promising approach to graph clustering is based on the intuitive notion of intra-cluster density vs. inter-cluster sparsity. While both formalizations and algorithms focusing on particular aspects of this rather vague concept have been proposed no conclusive argument on their appropriateness has been given.}, booktitle = {Algorithms - {ESA} 2003}, author = {Brandes, Ulrik and Gaertler, Marco and Wagner, Dorothea}, year = {2003}, keywords = {clustering}, pages = {568--579} }, @techreport{bavota_two-step_2010-1, address = {Salerno, Italy}, type = {Technical report}, title = {A two-step technique for extract class refactoring}, url = {http://www.sesa.dmi.unisa.it/TR10_01.pdf}, abstract = {During software evolution the internal structure of the system undergoes continuous modifications. These continuous changes drift away the source code from its original design, reducing its quality, including class cohesion. Several refactoring methods have been proposed to improve the cohesion of classes. In this paper we propose a novel approach supporting the Extract Class refactoring. The proposed approach analyzes the (structural and semantic) similarity of the methods in a class in order to identify chains of strongly related methods. The identified method chains are used to define new classes with higher cohesion than the original class. 
The study has been conducted on two open-source software systems, namely {ArgoUML}, and {JHotDraw.} The results reveal that the proposed approach is able to identify meaningful refactoring operations and significantly improves the cohesion of the refactored classes.}, number = {{TR10\_01}}, institution = {University of Salerno}, author = {Bavota, Gabriele and De Lucia, Andrea and Marcus, Andrian and Oliveto, Rocco}, year = {2010}, keywords = {cohesion, coupling, empirical, extract class, refactoring}, pages = {8} }, @inproceedings{churcher_visualising_2003, title = {Visualising class cohesion with virtual worlds}, isbn = {1-920682-03-1}, location = {Adelaide, Australia}, abstract = {An understanding of cohesion is an important factor in software design. However, cohesion is difficult to quantify, particularly for {OO}, and attempts to develop metrics have had limited success. We advocate the use of visualisation techniques to provide a richer view of cohesion than is possible with a single numeric value. In this paper we describe the application of {ANGLE} for {3D} graph layout and the use of {XSLT} transformations both to select the ingredients for visualisations and to determine their presentation details. We discuss our experiences with the use of virtual worlds as a presentation medium both on the desktop and in immersive environments and report early results from ongoing empirical work.}, booktitle = {Proceedings of the {Asia-Pacific} Symposium on Information Visualisation {(APVIS)}}, publisher = {Australian Computer Society, Inc}, author = {Churcher, Neville and Irwin, Warwick and Kriz, Ron}, year = {2003}, keywords = {cohesion, visualization}, pages = {89--97}, annote = {Draws {3D} graph structures of classes and notes the correspondence between {OO} cohesion and refactorings like extract class.} }, @techreport{simon_3d-spring_2000, title = {3d-spring embedder for complete graphs}, number = {11/00}, institution = {{BTU}, Inst. 
of Computer Science}, author = {Simon, F. and Steinbr{\"u}ckner, F. and Lewerentz, C.}, month = sep, year = {2000}, keywords = {graph layout, graphs} }, @article{dietterich_thomas_machine-learning_1997, title = {{Machine-Learning} Research}, volume = {18}, url = {https://www.aaai.org/ojs/index.php/aimagazine/article/viewArticle/1324}, abstract = {Machine-learning research has been making great progress in many directions. This article summarizes four of these directions and discusses some current open problems. The four directions are (1) the improvement of classification accuracy by learning ensembles of classifiers, (2) methods for scaling up supervised learning algorithms, (3) reinforcement learning, and (4) the learning of complex stochastic models.}, number = {4}, journal = {{AI} Magazine}, author = {Dietterich, Thomas}, month = dec, year = {1997}, keywords = {machine learning}, pages = {97--136}, annote = {Discusses how ensembles of classifiers can perform better than individual classifiers.} }, @inproceedings{ujhazi_new_2010, address = {Timisoara, Romania}, title = {New conceptual coupling and cohesion metrics for object-oriented systems}, abstract = {The paper presents two novel conceptual metrics for measuring coupling and cohesion in software systems. Our first metric, Conceptual Coupling between Object classes {(CCBO)}, is based on the well-known {CBO} coupling metric, while the other metric, Conceptual Lack of Cohesion on Methods {(CLCOM5)}, is based on the {LCOM5} cohesion metric. One advantage of the proposed conceptual metrics is that they can be computed in a simpler (and in many cases, programming language independent) way as compared to some of the structural metrics. We empirically studied {CCBO} and {CLCOM5} for predicting fault-proneness of classes in a large open-source system and compared these metrics with a host of existing structural and conceptual metrics for the same task. 
As the result, we found that the proposed conceptual metrics, when used in conjunction, can predict bugs nearly as precisely as the 58 structural metrics available in the Columbus source code quality framework and can be effectively combined with these metrics to improve bug prediction.}, booktitle = {Proceedings of the 10th {IEEE} International Working Conference on Source Code Analysis and Manipulation}, author = {{\'U}jh{\'a}zi, B. and Ferenc, R. and Poshyvanyk, D. and Gyim{\'o}thy, T.}, month = sep, year = {2010}, keywords = {cohesion, coupling, empirical, metrics, semantics} }, @techreport{alexander_analyzing_1986, title = {Analyzing the Run-time Behavior of Logic Programs}, number = {{MCC} Technical Report Number {DB-064-86}}, institution = {{MCC}}, author = {Alexander, William and Boughter, Ellen and Cassell, Keith and Keller, Tom}, year = {1986} }, @incollection{wagner_logic_2007, address = {Berlin, Heidelberg}, title = {A Logic Framework to Support Database Refactoring}, volume = {4653}, isbn = {978-3-540-74467-2, 978-3-540-74469-6}, url = {http://www.springerlink.com/content/d90682011k117251/}, booktitle = {Database and Expert Systems Applications}, publisher = {Springer Berlin Heidelberg}, author = {Chang, {Shi-Kuo} and Deufemia, Vincenzo and Polese, Giuseppe and Vacca, Mario}, editor = {Wagner, Roland and Revell, Norman and Pernul, G{\"u}nther}, year = {2007}, keywords = {database, refactoring}, pages = {509--518} }, @article{pan_class_2009, title = {Class structure refactoring of object-oriented softwares using community detection in dependency networks}, volume = {3}, issn = {1673-7350}, url = {http://www.springerlink.com/content/03512u800qpw4344/}, doi = {10.1007/s11704-009-0054-y}, abstract = {The quality of a software system is largely determined by its internal structures which always degrade over the software evolution. Therefore, the structures have to be reconditioned from time to time. 
However, the existing methods are very complex and resource-consuming when doing this task. In this paper, we present an approach to recondition the class structures of object-oriented {(OO)} software systems. It uses attribute-method networks and method-method networks to represent attributes, methods and dependencies between them; It proposes a guided community detection algorithm to obtain the optimized community structures in the method-method networks, which also correspond to the optimized class structures; It also provides a list of refactorings by comparing the optimized class structures with the real class structure in software systems and inspecting the attribute-method networks. The approach is evaluated using the open-source case study, {JHotDraw} 5.1, and the advantages of our approach are illustrated in comparison with existing methods.}, number = {3}, journal = {Frontiers of Computer Science in China}, author = {Pan, Weifeng and Li, Bing and Ma, Yutao and Liu, Jing and Qin, Yeyi}, month = aug, year = {2009}, keywords = {clustering, extract class, move method, refactoring, {SNA}, software clustering}, pages = {396--404}, annote = {Concentrates on move method. {"In} our approach, the algorithm travels through every method (node) with dependencies to other methods not defined in the same class (community), iteratively searches for the changes resulted from method-moving operation, and moves the method to the class that makes the largest increase in Q." 
Relevance: 4} }, @inproceedings{pedersen_wordnet::similarity:_2004, address = {Stroudsburg, {PA}, {USA}}, series = {{HLT-NAACL--Demonstrations} '04}, title = {{WordNet::Similarity:} measuring the relatedness of concepts}, location = {Boston, Massachusetts}, url = {http://dl.acm.org/citation.cfm?id=1614025.1614037}, booktitle = {Demonstration Papers at {HLT-NAACL} 2004}, publisher = {Association for Computational Linguistics}, author = {Pedersen, Ted and Patwardhan, Siddharth and Michelizzi, Jason}, year = {2004}, pages = {38--41} }, @article{cornelio_sound_2010, title = {Sound refactorings}, volume = {75}, issn = {0167-6423}, url = {http://www.sciencedirect.com/science/article/pii/S0167642309001300}, doi = {10.1016/j.scico.2009.10.001}, abstract = {Refactoring consists in restructuring an object-oriented program without changing its behaviour. In this paper, we present refactorings as transformation rules for programs written in a refinement language inspired on Java that allows reasoning about object-oriented programs and specifications. A set of programming laws is available for the imperative constructs of this language as well as for its object-oriented features; soundness of the laws is proved against a weakest precondition semantics. The proof that the refactoring rules preserve behaviour (semantics) is accomplished by the application of these programming laws and data simulation. 
As illustration of our approach to refactoring, we use our rules to restructure a program to be in accordance with a design pattern.}, number = {3}, journal = {Science of Computer Programming}, author = {Corn\'{e}lio, M\'{a}rcio and Cavalcanti, Ana and Sampaio, Augusto}, month = mar, year = {2010}, keywords = {Formal methods, refactoring, Refinement calculus}, pages = {106--133} }, @article{hastie_discriminant_1996, title = {Discriminant adaptive nearest neighbor classification}, volume = {18}, issn = {0162-8828}, doi = {10.1109/34.506411}, abstract = {Nearest neighbour classification expects the class conditional probabilities to be locally constant, and suffers from bias in high dimensions. We propose a locally adaptive form of nearest neighbour classification to try to ameliorate this curse of dimensionality. We use a local linear discriminant analysis to estimate an effective metric for computing neighbourhoods. We determine the local decision boundaries from centroid information, and then shrink neighbourhoods in directions orthogonal to these local decision boundaries, and elongate them parallel to the boundaries. Thereafter, any neighbourhood-based classifier can be employed, using the modified neighbourhoods. The posterior probabilities tend to be more homogeneous in the modified neighbourhoods. We also propose a method for global dimension reduction, that combines local dimension information. In a number of examples, the methods demonstrate the potential for substantial improvements over nearest neighbour classification}, number = {6}, journal = {Pattern Analysis and Machine Intelligence, {IEEE} Transactions on}, author = {Hastie, T. 
and Tibshirani, R.}, year = {1996}, keywords = {adaptive systems, approximation theory, centroid information, global dimension reduction, linear discriminant analysis, local decision boundaries, neighbourhood-based classifier, pattern classification, pattern recognition, probability}, pages = {607--616} }, @inproceedings{lincke_comparing_2008, address = {Seattle, Washington, {USA}}, title = {Comparing software metrics tools}, isbn = {978-1-60558-050-0}, abstract = {This paper shows that existing software metric tools interpret and implement the definitions of object-oriented software metrics differently. This delivers tool-dependent metrics results and has even implications on the results of analyses based on these metrics results. In short, the metrics based assessment of a software system and measures taken to improve its design differ considerably from tool to tool. To support our case, we conducted an experiment with a number of commercial and free metrics tools. We calculated metrics values using the same set of standard metrics for three software systems of different sizes. Measurements show that, for the same software system and metrics, the metrics values are tool depended. We also de?ned a (simple) software quality model for "maintainability" based on the metrics selected. It defines a ranking of the classes that are most critical wrt. maintainability. Measurements show that even the ranking of classes in a software system is metrics tool dependent.}, booktitle = {Proceedings of the 2008 International Symposium on Software Testing and Analysis}, author = {Lincke, R. and Lundberg, J. 
and L\"{o}we, W.}, year = {2008}, keywords = {metrics, metrics validation, survey, tools}, pages = {131--142}, annote = {Found huge amounts of inconsistency between metrics as measured by various tools.} }, @inproceedings{abreu_coupling-guided_2000, title = {A coupling-guided cluster analysis approach to reengineer the modularity of object-oriented systems}, doi = {10.1109/CSMR.2000.827300}, abstract = {Describes a validation experiment of a quantitative approach to the modularization of object-oriented systems. The approach used is based on cluster analysis, a statistical technique used in many fields of science to group items. In this case, the clusters are modules and the items are classes. A sample of some relatively large object-oriented systems was used in this experiment. The calculation of the dissimilarity between classes is based on their relative couplings combined through six different rating schemes. These couplings are classified according to a taxonomy framework where categories were assigned weights. The coupling data were obtained with the {MOODKit} G2 tool. The results obtained allow conclusions concerning the applicability of the proposed approach. This work was developed in the realm of the {MOOD} {(Modularization} of {Object-Oriented} Systems) project, which aims to deliver a quantitative framework to support the design of object-oriented systems}, booktitle = {Proceedings of the Fourth European Conf. Software Maintenance and Reengineering}, author = {Abreu, F. B. and Pereira, G. and Sousa, P.}, year = {2000}, keywords = {clustering, coupling, metrics, {OOP}, software clustering, statistical analysis, systems reengineering, taxonomy, validation}, pages = {13--22} }, @article{dig_how_2006, title = {How do {APIs} evolve? 
A story of refactoring: Research Articles}, volume = {18}, issn = {{1532-060X}}, url = {http://dl.acm.org/citation.cfm?id=1133105.1133107}, doi = {10.1002/smr.v18:2}, abstract = {Frameworks and libraries change their {APIs.} Migrating an application to the new {API} is tedious and disrupts the development process. Although some tools and ideas have been proposed to solve the evolution of {APIs}, most updates are done manually. To better understand the requirements for migration tools, we studied the {API} changes of four frameworks and one library. We discovered that the changes that break existing applications are not random, but tend to fall into particular categories. Over 80\% of these changes are refactorings. This suggests that refactoring-based migration tools should be used to update applications.}, number = {2}, journal = {J. Softw. Maint. Evol.}, author = {Dig, Danny and Johnson, Ralph}, month = mar, year = {2006}, keywords = {{API} evolution, component reuse, frameworks, libraries, refactoring}, pages = {83{\textendash}107} }, @inproceedings{yip_software_1994, title = {A software maintenance survey}, abstract = {We have conducted a survey of the state of software maintenance in Hong Kong, as the software industry in Hong Kong and south China is expanding. The survey instrument is derived from the previous work of Lientz and Swanson (1981) and Dekleva (1992). We sent out about 1000 questionnaires and received about 5\% replies. Our results indicate that, in Hong Kong, about 66\% of the total software life cycle cost is spent on software maintenance. The average application system is about 5 years old, consisting of 577 programs and 308000 lines of code. Making enhancements appears to be the most costly group of maintenance work (38\% of all maintenance work undertaken), followed by error correction (16\%). 
The most often cited problems in software maintenance are staff turnover, poor documentation and changing user requirements}, booktitle = {Proceedings First {Asia-Pacific} Software Engineering Conference}, author = {Yip, {S.W.L.} and Lam, T.}, year = {1994}, keywords = {maintenance}, pages = {70--79}, annote = {This is a general article about software maintenance. It is useful for background statistics. Relevance: 2 } }, @inproceedings{langelier_visualization-based_2005, address = {Long Beach, {CA}, {USA}}, title = {Visualization-based analysis of quality for large-scale software systems}, isbn = {1-59593-993-4}, url = {http://portal.acm.org/citation.cfm?id=1101941}, doi = {10.1145/1101908.1101941}, abstract = {We propose an approach for complex software analysis based on visualization. Our work is motivated by the fact that in spite of years of research and practice, software development and maintenance are still time and resource consuming, and high-risk activities. The most important reason in our opinion is the complexity of many phenomena related to software, such as its evolution and its reliability. In fact, there is very little theory explaining them. Today, we have a unique opportunity to empirically study these phenomena, thanks to large sets of software data available through open-source programs and open repositories. Automatic analysis techniques, such as statistics and machine learning, are usually limited when studying phenomena with unknown or poorly-understood influence factors. We claim that hybrid techniques that combine automatic analysis with human expertise through visualization are excellent alternatives to them. In this paper, we propose a visualization framework that supports quality analysis of large-scale software systems. 
We circumvent the problem of size by exploiting perception capabilities of the human visual system.}, booktitle = {Proceedings of the 20th {IEEE/ACM} international Conference on Automated software engineering}, publisher = {{ACM}}, author = {Langelier, Guillaume and Sahraoui, Houari and Poulin, Pierre}, year = {2005}, keywords = {visualization}, pages = {214--223} }, @article{baxter_understanding_2006, title = {Understanding the shape of Java software}, volume = {41}, url = {http://portal.acm.org/citation.cfm?id=1167507}, doi = {10.1145/1167515.1167507}, abstract = {Large amounts of Java software have been written since the language's escape into unsuspecting software ecology more than ten years ago. Surprisingly little is known about the structure of Java programs in the wild: about the way methods are grouped into classes and then into packages, the way packages relate to each other, or the way inheritance and composition are used to put these programs together. We present the results of the first in-depth study of the structure of Java programs. We have collected a number of Java programs and measured their key structural attributes. We have found evidence that some relationships follow power-laws, while others do not. We have also observed variations that seem related to some characteristic of the application itself. This study provides important information for researchers who can investigate how and why the structural relationships we find may have originated, what they portend, and how they can be managed.}, number = {10}, journal = {{ACM} {SIGPLAN} Notices}, author = {Baxter, Gareth and Frean, Marcus and Noble, James and Rickerby, Mark and Smith, Hayden and Visser, Matt and Melton, Hayden and Tempero, Ewan}, year = {2006}, keywords = {empirical, metrics, {OOD}, powerlaws}, pages = {397--412}, annote = {This is a paper that contains lots of metrics data on many different software packages. 
Relevance: 3} }, @article{newman_erratum:_2006, title = {Erratum: Scientific collaboration networks. {II.} Shortest paths, weighted networks, and centrality {[Phys.} Rev. E [bold 64], 016132 (2001)]}, volume = {73}, shorttitle = {Erratum}, url = {http://link.aps.org/abstract/PRE/v73/e039906}, doi = {10.1103/PhysRevE.73.039906}, abstract = {When there is more than one shortest path between a pair of vertices, the algorithm for the calculation of betweenness centrality presented in Section {IIB} does not always weight all paths equally as claimed. Instead it divides weights equally at each branch point along the path, which in some circumstances can result in slightly different values for the betweenness. Betweenness calculated in this latter fashion is sometimes called {\textquotedblleft}load{\textquotedblright}; see Goh et al. [1] for a discussion. A revised version of our algorithm that does weight all paths equally is given in [2,3].}, number = {3}, journal = {Physical Review E {(Statistical}, Nonlinear, and Soft Matter Physics)}, author = {Newman, M. E. J.}, month = mar, year = {2006}, keywords = {betweenness, clustering, graph algorithms, {SNA}}, pages = {039906--1}, annote = {References: [1] {K.-I.} Goh, E. Oh, H. Jeong, B. Kahng, and D. Kim, Proc. Natl. Acad. Sci. {U.S.A.} 99, 12583, 2000. [2] M. E. J. Newman and M. Girvan, Phys. Rev. E 69, 026113, 2004. [3] M. E. J. Newman, in Complex Networks, edited by E. {Ben-Naim}, H. Frauenfelder, and Z. Toroczkai {?Springer}, Berlin, 2004, No. 650 in Lecture Notes in Physics, pp. 337{\textendash}370.} }