18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Cell Broadband Engine OProfile Support 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * (C) Copyright IBM Corporation 2006 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Authors: Maynard Johnson <maynardj@us.ibm.com> 88c2ecf20Sopenharmony_ci * Carl Love <carll@us.ibm.com> 98c2ecf20Sopenharmony_ci */ 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci#include <linux/hrtimer.h> 128c2ecf20Sopenharmony_ci#include <linux/smp.h> 138c2ecf20Sopenharmony_ci#include <linux/slab.h> 148c2ecf20Sopenharmony_ci#include <asm/cell-pmu.h> 158c2ecf20Sopenharmony_ci#include <asm/time.h> 168c2ecf20Sopenharmony_ci#include "pr_util.h" 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_ci#define SCALE_SHIFT 14 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_cistatic u32 *samples; 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_ci/* spu_prof_running is a flag used to indicate if spu profiling is enabled 238c2ecf20Sopenharmony_ci * or not. It is set by the routines start_spu_profiling_cycles() and 248c2ecf20Sopenharmony_ci * start_spu_profiling_events(). The flag is cleared by the routines 258c2ecf20Sopenharmony_ci * stop_spu_profiling_cycles() and stop_spu_profiling_events(). These 268c2ecf20Sopenharmony_ci * routines are called via global_start() and global_stop() which are called in 278c2ecf20Sopenharmony_ci * op_powerpc_start() and op_powerpc_stop(). These routines are called once 288c2ecf20Sopenharmony_ci * per system as a result of the user starting/stopping oprofile. Hence, only 298c2ecf20Sopenharmony_ci * one CPU per user at a time will be changing the value of spu_prof_running. 308c2ecf20Sopenharmony_ci * In general, OProfile does not protect against multiple users trying to run 318c2ecf20Sopenharmony_ci * OProfile at a time. 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ciint spu_prof_running; 348c2ecf20Sopenharmony_cistatic unsigned int profiling_interval; 358c2ecf20Sopenharmony_ci 368c2ecf20Sopenharmony_ci#define NUM_SPU_BITS_TRBUF 16 378c2ecf20Sopenharmony_ci#define SPUS_PER_TB_ENTRY 4 388c2ecf20Sopenharmony_ci 398c2ecf20Sopenharmony_ci#define SPU_PC_MASK 0xFFFF 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ciDEFINE_SPINLOCK(oprof_spu_smpl_arry_lck); 428c2ecf20Sopenharmony_cistatic unsigned long oprof_spu_smpl_arry_lck_flags; 438c2ecf20Sopenharmony_ci 448c2ecf20Sopenharmony_civoid set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset) 458c2ecf20Sopenharmony_ci{ 468c2ecf20Sopenharmony_ci unsigned long ns_per_cyc; 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci if (!freq_khz) 498c2ecf20Sopenharmony_ci freq_khz = ppc_proc_freq/1000; 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_ci /* To calculate a timeout in nanoseconds, the basic 528c2ecf20Sopenharmony_ci * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency). 538c2ecf20Sopenharmony_ci * To avoid floating point math, we use the scale math 548c2ecf20Sopenharmony_ci * technique as described in linux/jiffies.h. We use 558c2ecf20Sopenharmony_ci * a scale factor of SCALE_SHIFT, which provides 4 decimal places 568c2ecf20Sopenharmony_ci * of precision. This is close enough for the purpose at hand. 578c2ecf20Sopenharmony_ci * 588c2ecf20Sopenharmony_ci * The value of the timeout should be small enough that the hw 598c2ecf20Sopenharmony_ci * trace buffer will not get more than about 1/3 full for the 608c2ecf20Sopenharmony_ci * maximum user specified (the LFSR value) hw sampling frequency. 618c2ecf20Sopenharmony_ci * This is to ensure the trace buffer will never fill even if the 628c2ecf20Sopenharmony_ci * kernel thread scheduling varies under a heavy system load. 638c2ecf20Sopenharmony_ci */ 648c2ecf20Sopenharmony_ci 658c2ecf20Sopenharmony_ci ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz; 668c2ecf20Sopenharmony_ci profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT; 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci} 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_ci/* 718c2ecf20Sopenharmony_ci * Extract SPU PC from trace buffer entry 728c2ecf20Sopenharmony_ci */ 738c2ecf20Sopenharmony_cistatic void spu_pc_extract(int cpu, int entry) 748c2ecf20Sopenharmony_ci{ 758c2ecf20Sopenharmony_ci /* the trace buffer is 128 bits */ 768c2ecf20Sopenharmony_ci u64 trace_buffer[2]; 778c2ecf20Sopenharmony_ci u64 spu_mask; 788c2ecf20Sopenharmony_ci int spu; 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci spu_mask = SPU_PC_MASK; 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci /* Each SPU PC is 16 bits; hence, four spus in each of 838c2ecf20Sopenharmony_ci * the two 64-bit buffer entries that make up the 848c2ecf20Sopenharmony_ci * 128-bit trace_buffer entry. Process two 64-bit values 858c2ecf20Sopenharmony_ci * simultaneously. 868c2ecf20Sopenharmony_ci * trace[0] SPU PC contents are: 0 1 2 3 878c2ecf20Sopenharmony_ci * trace[1] SPU PC contents are: 4 5 6 7 888c2ecf20Sopenharmony_ci */ 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci cbe_read_trace_buffer(cpu, trace_buffer); 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) { 938c2ecf20Sopenharmony_ci /* spu PC trace entry is upper 16 bits of the 948c2ecf20Sopenharmony_ci * 18 bit SPU program counter 958c2ecf20Sopenharmony_ci */ 968c2ecf20Sopenharmony_ci samples[spu * TRACE_ARRAY_SIZE + entry] 978c2ecf20Sopenharmony_ci = (spu_mask & trace_buffer[0]) << 2; 988c2ecf20Sopenharmony_ci samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry] 998c2ecf20Sopenharmony_ci = (spu_mask & trace_buffer[1]) << 2; 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF; 1028c2ecf20Sopenharmony_ci trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF; 1038c2ecf20Sopenharmony_ci } 1048c2ecf20Sopenharmony_ci} 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_cistatic int cell_spu_pc_collection(int cpu) 1078c2ecf20Sopenharmony_ci{ 1088c2ecf20Sopenharmony_ci u32 trace_addr; 1098c2ecf20Sopenharmony_ci int entry; 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci /* process the collected SPU PC for the node */ 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci entry = 0; 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_ci trace_addr = cbe_read_pm(cpu, trace_address); 1168c2ecf20Sopenharmony_ci while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) { 1178c2ecf20Sopenharmony_ci /* there is data in the trace buffer to process */ 1188c2ecf20Sopenharmony_ci spu_pc_extract(cpu, entry); 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci entry++; 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci if (entry >= TRACE_ARRAY_SIZE) 1238c2ecf20Sopenharmony_ci /* spu_samples is full */ 1248c2ecf20Sopenharmony_ci break; 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci trace_addr = cbe_read_pm(cpu, trace_address); 1278c2ecf20Sopenharmony_ci } 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci return entry; 1308c2ecf20Sopenharmony_ci} 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_cistatic enum hrtimer_restart profile_spus(struct hrtimer *timer) 1348c2ecf20Sopenharmony_ci{ 1358c2ecf20Sopenharmony_ci ktime_t kt; 1368c2ecf20Sopenharmony_ci int cpu, node, k, num_samples, spu_num; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci if (!spu_prof_running) 1398c2ecf20Sopenharmony_ci goto stop; 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci for_each_online_cpu(cpu) { 1428c2ecf20Sopenharmony_ci if (cbe_get_hw_thread_id(cpu)) 1438c2ecf20Sopenharmony_ci continue; 1448c2ecf20Sopenharmony_ci 1458c2ecf20Sopenharmony_ci node = cbe_cpu_to_node(cpu); 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci /* There should only be one kernel thread at a time processing 1488c2ecf20Sopenharmony_ci * the samples. In the very unlikely case that the processing 1498c2ecf20Sopenharmony_ci * is taking a very long time and multiple kernel threads are 1508c2ecf20Sopenharmony_ci * started to process the samples. Make sure only one kernel 1518c2ecf20Sopenharmony_ci * thread is working on the samples array at a time. The 1528c2ecf20Sopenharmony_ci * sample array must be loaded and then processed for a given 1538c2ecf20Sopenharmony_ci * cpu. The sample array is not per cpu. 1548c2ecf20Sopenharmony_ci */ 1558c2ecf20Sopenharmony_ci spin_lock_irqsave(&oprof_spu_smpl_arry_lck, 1568c2ecf20Sopenharmony_ci oprof_spu_smpl_arry_lck_flags); 1578c2ecf20Sopenharmony_ci num_samples = cell_spu_pc_collection(cpu); 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci if (num_samples == 0) { 1608c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, 1618c2ecf20Sopenharmony_ci oprof_spu_smpl_arry_lck_flags); 1628c2ecf20Sopenharmony_ci continue; 1638c2ecf20Sopenharmony_ci } 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci for (k = 0; k < SPUS_PER_NODE; k++) { 1668c2ecf20Sopenharmony_ci spu_num = k + (node * SPUS_PER_NODE); 1678c2ecf20Sopenharmony_ci spu_sync_buffer(spu_num, 1688c2ecf20Sopenharmony_ci samples + (k * TRACE_ARRAY_SIZE), 1698c2ecf20Sopenharmony_ci num_samples); 1708c2ecf20Sopenharmony_ci } 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, 1738c2ecf20Sopenharmony_ci oprof_spu_smpl_arry_lck_flags); 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci } 1768c2ecf20Sopenharmony_ci smp_wmb(); /* insure spu event buffer updates are written */ 1778c2ecf20Sopenharmony_ci /* don't want events intermingled... */ 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci kt = profiling_interval; 1808c2ecf20Sopenharmony_ci if (!spu_prof_running) 1818c2ecf20Sopenharmony_ci goto stop; 1828c2ecf20Sopenharmony_ci hrtimer_forward(timer, timer->base->get_time(), kt); 1838c2ecf20Sopenharmony_ci return HRTIMER_RESTART; 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci stop: 1868c2ecf20Sopenharmony_ci printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n"); 1878c2ecf20Sopenharmony_ci return HRTIMER_NORESTART; 1888c2ecf20Sopenharmony_ci} 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_cistatic struct hrtimer timer; 1918c2ecf20Sopenharmony_ci/* 1928c2ecf20Sopenharmony_ci * Entry point for SPU cycle profiling. 1938c2ecf20Sopenharmony_ci * NOTE: SPU profiling is done system-wide, not per-CPU. 1948c2ecf20Sopenharmony_ci * 1958c2ecf20Sopenharmony_ci * cycles_reset is the count value specified by the user when 1968c2ecf20Sopenharmony_ci * setting up OProfile to count SPU_CYCLES. 1978c2ecf20Sopenharmony_ci */ 1988c2ecf20Sopenharmony_ciint start_spu_profiling_cycles(unsigned int cycles_reset) 1998c2ecf20Sopenharmony_ci{ 2008c2ecf20Sopenharmony_ci ktime_t kt; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci pr_debug("timer resolution: %lu\n", TICK_NSEC); 2038c2ecf20Sopenharmony_ci kt = profiling_interval; 2048c2ecf20Sopenharmony_ci hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2058c2ecf20Sopenharmony_ci hrtimer_set_expires(&timer, kt); 2068c2ecf20Sopenharmony_ci timer.function = profile_spus; 2078c2ecf20Sopenharmony_ci 2088c2ecf20Sopenharmony_ci /* Allocate arrays for collecting SPU PC samples */ 2098c2ecf20Sopenharmony_ci samples = kcalloc(SPUS_PER_NODE * TRACE_ARRAY_SIZE, sizeof(u32), 2108c2ecf20Sopenharmony_ci GFP_KERNEL); 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci if (!samples) 2138c2ecf20Sopenharmony_ci return -ENOMEM; 2148c2ecf20Sopenharmony_ci 2158c2ecf20Sopenharmony_ci spu_prof_running = 1; 2168c2ecf20Sopenharmony_ci hrtimer_start(&timer, kt, HRTIMER_MODE_REL); 2178c2ecf20Sopenharmony_ci schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci return 0; 2208c2ecf20Sopenharmony_ci} 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci/* 2238c2ecf20Sopenharmony_ci * Entry point for SPU event profiling. 2248c2ecf20Sopenharmony_ci * NOTE: SPU profiling is done system-wide, not per-CPU. 2258c2ecf20Sopenharmony_ci * 2268c2ecf20Sopenharmony_ci * cycles_reset is the count value specified by the user when 2278c2ecf20Sopenharmony_ci * setting up OProfile to count SPU_CYCLES. 2288c2ecf20Sopenharmony_ci */ 2298c2ecf20Sopenharmony_civoid start_spu_profiling_events(void) 2308c2ecf20Sopenharmony_ci{ 2318c2ecf20Sopenharmony_ci spu_prof_running = 1; 2328c2ecf20Sopenharmony_ci schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_ci return; 2358c2ecf20Sopenharmony_ci} 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_civoid stop_spu_profiling_cycles(void) 2388c2ecf20Sopenharmony_ci{ 2398c2ecf20Sopenharmony_ci spu_prof_running = 0; 2408c2ecf20Sopenharmony_ci hrtimer_cancel(&timer); 2418c2ecf20Sopenharmony_ci kfree(samples); 2428c2ecf20Sopenharmony_ci pr_debug("SPU_PROF: stop_spu_profiling_cycles issued\n"); 2438c2ecf20Sopenharmony_ci} 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_civoid stop_spu_profiling_events(void) 2468c2ecf20Sopenharmony_ci{ 2478c2ecf20Sopenharmony_ci spu_prof_running = 0; 2488c2ecf20Sopenharmony_ci} 249