diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 776ce003248e..d33f579675b7 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -97,8 +97,7 @@ struct tid_pageset { static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, struct rb_root *); -static u32 find_phys_blocks(struct page **, unsigned, - struct tid_pageset *) __maybe_unused; +static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long, @@ -119,7 +118,7 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *, unsigned long, unsigned long); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, - u32 *, unsigned *, unsigned *) __maybe_unused; + u32 *, unsigned *, unsigned *); static int unprogram_rcvarray(struct file *, u32, struct tid_group **); static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *); @@ -339,9 +338,265 @@ static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) writeq(0, dd->rcvarray_wc + (index * 8)); } +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) { - return -EINVAL; + int ret = 0, need_group = 0, pinned; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, + tididx = 0, mapped, mapped_pages = 0; + unsigned long vaddr = tinfo->vaddr; + struct page **pages = NULL; + u32 *tidlist = NULL; + struct tid_pageset *pagesets = NULL; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tinfo->length); + if (!npages) + return -EINVAL; + + if (npages > uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + + pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), + GFP_KERNEL); + if (!pagesets) + return -ENOMEM; + + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto bail; + } + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + if (pinned <= 0) { + ret = pinned; + goto bail; + } + + /* Find sets of physically contiguous pages */ + npagesets = find_phys_blocks(pages, pinned, pagesets); + + /* + * We don't need to access this under a lock since tid_used is per + * process and the same process cannot be in hfi1_user_exp_rcv_clear() + * and hfi1_user_exp_rcv_setup() at the same time. + */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + npagesets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = npagesets; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) + goto bail; + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto nomem; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. + */ + mutex_lock(&uctxt->exp_lock); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, dd->rcv_entries.group_size, + pages, tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + pageidx += ret; + mapped_pages += mapped; + } + + while (pageidx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - pageidx, + grp->size - grp->used); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, use, pages, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + ret = -EFAULT; + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + pageidx += ret; + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (pageidx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_lock); +nomem: + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + if (tididx) { + spin_lock(&fd->tid_lock); + fd->tid_used += tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, + tidlist, sizeof(tidlist[0]) * tididx)) { + /* + * On failure to copy to the user level, we need to undo + * everything done so far so we don't leak resources. + */ + tinfo->tidlist = (unsigned long)&tidlist; + hfi1_user_exp_rcv_clear(fp, tinfo); + tinfo->tidlist = 0; + ret = -EFAULT; + goto bail; + } + } + + /* + * If not everything was mapped (due to insufficient RcvArray entries, + * for example), unpin all unmapped pages so we can pin them nex time. + */ + if (mapped_pages != pinned) + hfi1_release_user_pages(&pages[mapped_pages], + pinned - mapped_pages, + false); +bail: + kfree(pagesets); + kfree(pages); + kfree(tidlist); + return ret > 0 ? 0 : ret; } int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)