diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
index 7a32faa2f688..a6523df0f1d3 100644
--- a/drivers/gpu/drm/xe/tests/xe_migrate.c
+++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
@@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
 
 	xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
 	emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false,
-		 &src_it, XE_PAGE_SIZE, pt);
+		 &src_it, XE_PAGE_SIZE, pt->ttm.resource);
 
 	run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
 
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 02fca8f9adc2..e05e9e7282b6 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -62,6 +62,8 @@ struct xe_migrate {
 	 * out of the pt_bo.
 	 */
 	struct drm_suballoc_manager vm_update_sa;
+	/** @min_chunk_size: Minimum chunk size for dgfx */
+	u64 min_chunk_size;
 };
 
 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
@@ -363,6 +365,19 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
 	if (err)
 		return ERR_PTR(err);
 
+	if (IS_DGFX(xe)) {
+		if (xe_device_has_flat_ccs(xe))
+			/* min chunk size corresponds to 4K of CCS metadata */
+			m->min_chunk_size = SZ_4K * SZ_64K /
+				xe_device_ccs_bytes(xe, SZ_64K);
+		else
+			/* Somewhat arbitrary to avoid a huge number of blits */
+			m->min_chunk_size = SZ_64K;
+		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
+		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
+			(unsigned long long)m->min_chunk_size);
+	}
+
 	return m;
 }
 
@@ -374,16 +389,35 @@ static u64 max_mem_transfer_per_pass(struct xe_device *xe)
 	return MAX_PREEMPTDISABLE_TRANSFER;
 }
 
-static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
+static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
 {
-	/*
-	 * For VRAM we use identity mapped pages so we are limited to current
-	 * cursor size. For system we program the pages ourselves so we have no
-	 * such limitation.
-	 */
-	return min_t(u64, max_mem_transfer_per_pass(xe),
-		     mem_type_is_vram(cur->mem_type) ? cur->size :
-		     cur->remaining);
+	struct xe_device *xe = tile_to_xe(m->tile);
+	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
+
+	if (mem_type_is_vram(cur->mem_type)) {
+		/*
+		 * For VRAM we want to blit in chunks with sizes aligned to
+		 * min_chunk_size, so that the offset to the CCS metadata is
+		 * page-aligned. If it's the last chunk it may be smaller.
+		 *
+		 * Another constraint is that we need to limit the blit to
+		 * the VRAM block size, unless size is smaller than
+		 * min_chunk_size.
+		 */
+		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
+
+		size = min_t(u64, size, chunk);
+		if (size > m->min_chunk_size)
+			size = round_down(size, m->min_chunk_size);
+	}
+
+	return size;
+}
+
+static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
+{
+	/* If the chunk is not fragmented, allow identity map. */
+	return cur->size >= size;
 }
 
 static u32 pte_update_size(struct xe_migrate *m,
@@ -396,7 +430,12 @@ static u32 pte_update_size(struct xe_migrate *m,
 	u32 cmds = 0;
 
 	*L0_pt = pt_ofs;
-	if (!is_vram) {
+	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
+		/* Offset into identity map. */
+		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
+					      cur->start + vram_region_gpu_offset(res));
+		cmds += cmd_size;
+	} else {
 		/* Clip L0 to available size */
 		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
 		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -412,11 +451,6 @@ static u32 pte_update_size(struct xe_migrate *m,
 
 		/* Each chunk has a single blit command */
 		cmds += cmd_size;
-	} else {
-		/* Offset into identity map. */
-		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
-					      cur->start + vram_region_gpu_offset(res));
-		cmds += cmd_size;
 	}
 
 	return cmds;
@@ -426,10 +460,10 @@ static void emit_pte(struct xe_migrate *m,
 		     struct xe_bb *bb, u32 at_pt,
 		     bool is_vram, bool is_comp_pte,
 		     struct xe_res_cursor *cur,
-		     u32 size, struct xe_bo *bo)
+		     u32 size, struct ttm_resource *res)
 {
 	struct xe_device *xe = tile_to_xe(m->tile);
-
+	struct xe_vm *vm = m->q->vm;
 	u16 pat_index;
 	u32 ptes;
 	u64 ofs = at_pt * XE_PAGE_SIZE;
@@ -442,13 +476,6 @@ static void emit_pte(struct xe_migrate *m,
 	else
 		pat_index = xe->pat.idx[XE_CACHE_WB];
 
-	/*
-	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
-	 * we're only emitting VRAM PTEs during sanity tests, so when
-	 * that's moved to a Kunit test, we should condition VRAM PTEs
-	 * on running tests.
-	 */
-
 	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
 
 	while (ptes) {
@@ -468,20 +495,22 @@ static void emit_pte(struct xe_migrate *m,
 			addr = xe_res_dma(cur) & PAGE_MASK;
 
 			if (is_vram) {
-				/* Is this a 64K PTE entry? */
-				if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
-				    !(cur_ofs & (16 * 8 - 1))) {
-					xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
+				if (vm->flags & XE_VM_FLAG_64K) {
+					u64 va = cur_ofs * XE_PAGE_SIZE / 8;
+
+					xe_assert(xe, (va & (SZ_64K - 1)) ==
+						  (addr & (SZ_64K - 1)));
+
 					flags |= XE_PTE_PS64;
 				}
 
-				addr += vram_region_gpu_offset(bo->ttm.resource);
+				addr += vram_region_gpu_offset(res);
 				devmem = true;
 			}
 
-			addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
-								 addr, pat_index,
-								 0, devmem, flags);
+			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
							   addr, pat_index,
							   0, devmem, flags);
 
 			bb->cs[bb->len++] = lower_32_bits(addr);
 			bb->cs[bb->len++] = upper_32_bits(addr);
@@ -693,8 +722,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 		bool usm = xe->info.has_usm;
 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-		src_L0 = xe_migrate_res_sizes(xe, &src_it);
-		dst_L0 = xe_migrate_res_sizes(xe, &dst_it);
+		src_L0 = xe_migrate_res_sizes(m, &src_it);
+		dst_L0 = xe_migrate_res_sizes(m, &dst_it);
 
 		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
 			pass++, src_L0, dst_L0);
@@ -715,6 +744,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 						      &ccs_ofs, &ccs_pt, 0,
 						      2 * avail_pts,
 						      avail_pts);
+			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
 		}
 
 		/* Add copy commands size here */
@@ -727,20 +757,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 			goto err_sync;
 		}
 
-		if (!src_is_vram)
-			emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
-				 src_bo);
-		else
+		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
 			xe_res_next(&src_it, src_L0);
-
-		if (!dst_is_vram)
-			emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
-				 dst_bo);
 		else
+			emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
+				 src);
+
+		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
 			xe_res_next(&dst_it, src_L0);
+		else
+			emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
+				 dst);
 
 		if (copy_system_ccs)
-			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo);
+			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
 
 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
@@ -949,7 +979,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		bool usm = xe->info.has_usm;
 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-		clear_L0 = xe_migrate_res_sizes(xe, &src_it);
+		clear_L0 = xe_migrate_res_sizes(m, &src_it);
 
 		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
 
@@ -976,12 +1006,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		size -= clear_L0;
 
 		/* Preemption is enabled again by the ring ops. */
-		if (!clear_vram) {
-			emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
-				 bo);
-		} else {
+		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
			xe_res_next(&src_it, clear_L0);
-		}
+		else
+			emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
+				 dst);
+
 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
 
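
For reviewers, a standalone sketch of the min_chunk_size derivation used in
xe_migrate_init() above. It is illustration only, not part of the patch: the
1:256 CCS-to-main-memory ratio in ccs_bytes() is an assumed stand-in for
xe_device_ccs_bytes(), which is platform-dependent. Under that assumption, a
chunk of SZ_4K * SZ_64K / 256 = 1 MiB of VRAM owns exactly 4K of CCS
metadata, so the per-chunk offset into the CCS area stays page-aligned:

#include <stdint.h>
#include <stdio.h>

#define SZ_4K	0x1000ULL
#define SZ_64K	0x10000ULL

/*
 * Assumed stand-in for xe_device_ccs_bytes(): 1 byte of CCS metadata
 * per 256 bytes of main memory. The real ratio is queried per device.
 */
static uint64_t ccs_bytes(uint64_t main_bytes)
{
	return main_bytes / 256;
}

/* Same semantics as the kernel's roundup_pow_of_two() for v >= 1. */
static uint64_t roundup_pow_of_two(uint64_t v)
{
	uint64_t r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	/*
	 * Smallest chunk whose CCS metadata fills exactly 4K, matching
	 * the expression in xe_migrate_init() above.
	 */
	uint64_t min_chunk_size = SZ_4K * SZ_64K / ccs_bytes(SZ_64K);

	min_chunk_size = roundup_pow_of_two(min_chunk_size);
	printf("min_chunk_size = 0x%08llx\n",
	       (unsigned long long)min_chunk_size);
	return 0;
}

Built with a plain C compiler this prints min_chunk_size = 0x00100000, i.e.
1 MiB under the assumed ratio.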
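
Likewise, a self-contained sketch of how the new xe_migrate_res_sizes()
chunking interacts with xe_migrate_allow_identity(). The helper names mirror
the patch, but the fixed values (8M per-pass limit, 1M min_chunk_size) and
the harness are assumptions for illustration; round_down() can be a simple
mask because min_chunk_size is forced to a power of two above:

#include <stdint.h>
#include <stdio.h>

#define SZ_64K	0x10000ULL
#define SZ_1M	0x100000ULL
#define SZ_8M	0x800000ULL	/* stand-in for MAX_PREEMPTDISABLE_TRANSFER */

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/*
 * Mirrors the VRAM branch of xe_migrate_res_sizes(): cur_size is the
 * contiguous span under the cursor, remaining the total bytes left.
 */
static uint64_t res_size(uint64_t cur_size, uint64_t remaining,
			 uint64_t min_chunk_size)
{
	uint64_t size = min_u64(SZ_8M, remaining);
	uint64_t chunk = max_u64(cur_size, min_chunk_size);

	size = min_u64(size, chunk);
	if (size > min_chunk_size)
		size &= ~(min_chunk_size - 1);

	return size;
}

/* Mirrors xe_migrate_allow_identity(): identity map only when the
 * whole chunk is contiguous under the cursor.
 */
static int allow_identity(uint64_t size, uint64_t cur_size)
{
	return cur_size >= size;
}

int main(void)
{
	/*
	 * Fragmented VRAM: only 64K contiguous, but the chunk is kept
	 * at min_chunk_size; identity mapping is refused, so PTEs are
	 * emitted instead.
	 */
	uint64_t size = res_size(SZ_64K, SZ_8M, SZ_1M);

	printf("size 0x%llx identity %d\n", (unsigned long long)size,
	       allow_identity(size, SZ_64K));	/* 0x100000, 0 */

	/*
	 * Contiguous VRAM: clipped to the block size and rounded down
	 * to a min_chunk_size multiple; identity mapping is allowed.
	 */
	size = res_size(3 * SZ_1M + SZ_64K, 16 * SZ_1M, SZ_1M);
	printf("size 0x%llx identity %d\n", (unsigned long long)size,
	       allow_identity(size, 3 * SZ_1M + SZ_64K));	/* 0x300000, 1 */

	return 0;
}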
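
Finally, a small sketch of the alignment invariant behind the new
xe_assert() in emit_pte(): with XE_VM_FLAG_64K the patch now always sets
XE_PTE_PS64 and instead asserts that the virtual address implied by the PTE
offset and the DMA address agree modulo 64K. The interpretation of cur_ofs
as a byte offset into the level-0 page table, and the concrete numbers, are
assumptions for illustration:

#include <assert.h>
#include <stdint.h>

#define XE_PAGE_SIZE	0x1000ULL	/* 4K GPU page */
#define SZ_64K		0x10000ULL

/*
 * Each PTE is 8 bytes and maps XE_PAGE_SIZE of VA, so a byte offset
 * cur_ofs into the page table implies va = cur_ofs * XE_PAGE_SIZE / 8.
 */
static uint64_t pte_ofs_to_va(uint64_t cur_ofs)
{
	return cur_ofs * XE_PAGE_SIZE / 8;
}

int main(void)
{
	uint64_t cur_ofs = 16 * 8;	/* 16 PTEs in: va = 16 * 4K = 64K */
	uint64_t addr = 0x40000;	/* 64K-aligned DMA address */

	/* PS64 is only valid when va and addr are congruent mod 64K. */
	assert((pte_ofs_to_va(cur_ofs) & (SZ_64K - 1)) ==
	       (addr & (SZ_64K - 1)));

	return 0;
}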